diff --git a/apps/maprobe/Makefile b/apps/maprobe/Makefile index c36a3029bc23756a1bee6417d9c3083c8b3fe60b..663c48ebef40f185ec9e528b55d95c76c9039367 100644 --- a/apps/maprobe/Makefile +++ b/apps/maprobe/Makefile @@ -1,3 +1,3 @@ NAME = maprobe -SRCS = maprobe.c +SRCS = common.c bitutils.c resultmat.c latency-test.c bandwidth-test.c replacement-test.c main.c include $(AM_HOME)/Makefile.app diff --git a/apps/maprobe/README.md b/apps/maprobe/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e6f712220831019e2bb7a24bafd69022642c266b --- /dev/null +++ b/apps/maprobe/README.md @@ -0,0 +1,3 @@ +# Micro Architecture Probe (MAProbe) + +* Memory access latency test \ No newline at end of file diff --git a/apps/maprobe/bandwidth-test.c b/apps/maprobe/bandwidth-test.c new file mode 100644 index 0000000000000000000000000000000000000000..549956d3c1027a95666301954ccf989a129b46d1 --- /dev/null +++ b/apps/maprobe/bandwidth-test.c @@ -0,0 +1,99 @@ +#include "maprobe.h" + +float test_l1_load_bandwidth(uint64_t size, int iter, int to_csv) +{ + // printf("stride %d linear access latency test\n", step); + // printf("range (B), read latency, iters, samples, cycles\n"); + assert(size >= _PERF_CACHELINE_SIZE_BYTE); + + // _perf_print_timer(); + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) { + __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("ld a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("ld a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("ld a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("ld a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("ld a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("ld a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("ld a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0"); + } + } + _perf_end_timer(); + // _perf_print_timer(); + uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter; + float bandwidth = total_access * 8 * BYTE / (float)perf.cycle; + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle); + } else { + printf("range %ldKB (%d iters) dcache linear (8Byte) read, latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", + size/KB, iter, (float)perf.cycle / total_access, bandwidth, total_access, perf.cycle, 8 + ); + } + _perf_g_total_samples += total_access; + return bandwidth; +} + +float test_l1_store_bandwidth(uint64_t size, int iter, int to_csv) +{ + // printf("stride %d linear access latency test\n", step); + // printf("range (B), read latency, iters, samples, cycles\n"); + assert(size >= _PERF_CACHELINE_SIZE_BYTE); + + // _perf_print_timer(); + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) { + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("sd a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("sd a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("sd a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("sd a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("sd a0, 40(%[addr])\n" :: 
[addr] "r"(address) : "a0"); + __asm__ volatile ("sd a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("sd a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0"); + } + } + _perf_end_timer(); + // _perf_print_timer(); + uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter; + float bandwidth = total_access * 8 * BYTE / (float)perf.cycle; + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle); + } else { + printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", + size/KB, iter, (float)perf.cycle / total_access, bandwidth, total_access, perf.cycle, 8 + ); + } + _perf_g_total_samples += total_access; + return bandwidth; +} + +float test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv) +{ + // printf("stride %d linear access latency test\n", step); + // printf("range (B), read latency, iters, samples, cycles\n"); + assert(size >= _PERF_CACHELINE_SIZE_BYTE); + + // _perf_print_timer(); + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) { + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0"); + } + } + _perf_end_timer(); + // _perf_print_timer(); + uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * iter; + float bandwidth = total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle; + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle); + } else { + printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (L1-L2 %f B/cycle) (%ld samples, %ld cycles), stride %dB\n", + size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle, total_access, perf.cycle, _PERF_CACHELINE_SIZE_BYTE + ); + } + _perf_g_total_samples += total_access; + return bandwidth; +} \ No newline at end of file diff --git a/apps/maprobe/bitutils.c b/apps/maprobe/bitutils.c new file mode 100644 index 0000000000000000000000000000000000000000..cfad50f8ce951e25f393a6dd8befeaca3d189614 --- /dev/null +++ b/apps/maprobe/bitutils.c @@ -0,0 +1,36 @@ +#include "bitutils.h" + +inline uint64_t _perf_get_bit_mask(int low, int high) { + assert(low < high); + assert(low >= 0); + assert(high < 63); + return ((1 << high) - 1) >> low << low; +} + +inline uint64_t _perf_get_bits(uint64_t raw_data, int low, int high) { + assert(low < high); + assert(low >= 0); + assert(high < 63); + uint64_t mask = (1 << high) - 1; + return (raw_data & mask) >> low; +} + +inline uint64_t _perf_get_bit(uint64_t raw_data, int position) { + assert(position >= 0); + assert(position <= 63); + return (raw_data >> position) & 1; +} + +inline uint64_t _perf_set_bits(uint64_t raw_data, int low, int high, uint64_t new_value) { + assert(low < high); + assert(low >= 0); + assert(high < 63); + int mask = _perf_get_bit_mask(low, high); + return (raw_data & mask) | ((new_value << low) & mask); +} + +inline uint64_t _perf_set_bit(uint64_t raw_data, int position, int new_value) { + assert(position >= 0); + assert(position <= 63); + return raw_data & ((new_value & 1) << position); +} diff --git a/apps/maprobe/common.c b/apps/maprobe/common.c new file mode 100644 index 
diff --git a/apps/maprobe/common.c b/apps/maprobe/common.c
new file mode 100644
index 0000000000000000000000000000000000000000..2b5e7a961aa9d0e91ebfef3688c1c563a6544bf1
--- /dev/null
+++ b/apps/maprobe/common.c
@@ -0,0 +1,49 @@
+#include "maprobe.h"
+
+struct perf perf;
+uint64_t _perf_g_total_samples = 0;
+
+void _perf_start_timer()
+{
+#ifndef PERF_SIM
+    perf.instrcnt = csr_read(CSR_MINSTRET);
+    perf.cycle = csr_read(CSR_MCYCLE);
+#endif
+}
+
+void _perf_end_timer()
+{
+#ifndef PERF_SIM
+    perf.cycle = csr_read(CSR_MCYCLE) - perf.cycle;
+    perf.instrcnt = csr_read(CSR_MINSTRET) - perf.instrcnt;
+#endif
+}
+
+void _perf_print_timer()
+{
+    printf("cycle %ld inst %ld ipc %f\n", perf.cycle, perf.instrcnt, (float)perf.instrcnt/perf.cycle);
+}
+
+void _perf_calibrate()
+{
+#ifndef PERF_SIM
+    // csr read delay
+    uint64_t cycle_1 = csr_read(CSR_MCYCLE);
+    uint64_t cycle_2 = csr_read(CSR_MCYCLE);
+    perf.csr_read_cycle = cycle_2-cycle_1;
+    printf("perf_calibrate: csr_read_cycle %ld\n", perf.csr_read_cycle);
+
+    // csr read inst cost
+    uint64_t inst_1 = csr_read(CSR_MINSTRET);
+    uint64_t inst_2 = csr_read(CSR_MINSTRET);
+    perf.csr_read_ninst = inst_2-inst_1;
+    printf("perf_calibrate: csr_read_ninst %ld\n", perf.csr_read_ninst);
+#else
+    printf("running in simulation environment, hpm read disabled\n");
+#endif
+}
+
+void _perf_blackhole(uint64_t value)
+{
+    *(uint64_t*) _PERF_BLACKHOLE = value;
+}
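The timer pair above is the measurement primitive every test in this patch builds on: `_perf_start_timer`/`_perf_end_timer` snapshot and difference the `mcycle`/`minstret` CSRs, leaving the deltas in the global `perf` struct. A minimal usage sketch (the function name and loop body are illustrative):

```c
#include "maprobe.h"

// Hypothetical micro-test skeleton: wrap the measured region with the
// timer pair, then derive cycles-per-access from perf.cycle.
float measure_region(int iter) {
    _perf_start_timer();
    for (int i = 0; i < iter; i++) {
        __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(_PERF_TEST_ADDR_BASE) : "a0");
    }
    _perf_end_timer();
    return (float)perf.cycle / iter;  // average cycles per access
}
```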
diff --git a/apps/maprobe/include/bitutils.h b/apps/maprobe/include/bitutils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3b7f54be8711d5558f8905af664d85c3012972f
--- /dev/null
+++ b/apps/maprobe/include/bitutils.h
@@ -0,0 +1,14 @@
+// bit op utils for perf
+
+#ifndef PROBE_BITUTILS_H
+#define PROBE_BITUTILS_H
+
+#include <klib.h>
+
+extern uint64_t _perf_get_bit_mask(int low, int high);
+extern uint64_t _perf_get_bits(uint64_t raw_data, int low, int high);
+extern uint64_t _perf_get_bit(uint64_t raw_data, int position);
+extern uint64_t _perf_set_bits(uint64_t raw_data, int low, int high, uint64_t new_value);
+extern uint64_t _perf_set_bit(uint64_t raw_data, int position, int new_value);
+
+#endif
\ No newline at end of file
diff --git a/apps/maprobe/include/maprobe.h b/apps/maprobe/include/maprobe.h
index ace1682c2e3f023e9c2dd7cef841e3ed2b43064b..85695a8bb515f7b0a03ea1a7ea760a5ee59f9f2f 100644
--- a/apps/maprobe/include/maprobe.h
+++ b/apps/maprobe/include/maprobe.h
@@ -5,6 +5,11 @@
 #include <am.h>
 #include <klib.h>
+#include "bitutils.h"
+#include "resultmat.h"
+
+// config
+// #define PERF_SIM // probe runs in a simulator, disable perf counters
 
 // perf const
 #define BYTE (1)
@@ -13,15 +18,28 @@
 #define GB (1024*MB)
 
 // platform dependent const
-// #define _PERF_TEST_ADDR_BASE 0x80400000
-#define _PERF_TEST_ADDR_BASE 0x2000400000
+#ifndef _PERF_TEST_ADDR_BASE
+#define _PERF_TEST_ADDR_BASE 0x80400000
+// #define _PERF_TEST_ADDR_BASE 0x2000400000
+#endif
 #define _PERF_CACHELINE_SIZE_BYTE (64 * BYTE)
-#define _PERF_L1_NOALIAS_SIZE_BYTE (32 * KB)
-#define _PERF_L1_SIZE_BYTE (128 * KB)
-#define _PERF_L2_SIZE_BYTE (512 * KB)
-#define _PERF_L3_SIZE_BYTE (2 * MB)
-#define _PERF_L1_NUM_WAYS 8
-#define _PERF_SET_SIZE_BYTE (_PERF_L1_SIZE_BYTE / _PERF_L1_NUM_WAYS)
+#define _PERF_PAGE_SIZE_BYTE (4 * KB)
+#define _PERF_L1_NOALIAS_SIZE_BYTE (16 * KB)
+#define _PERF_L1_SIZE_BYTE (64 * KB)
+#define _PERF_L2_SIZE_BYTE (1 * MB)
+#define _PERF_L3_SIZE_BYTE (6 * MB)
+#define _PERF_MEM_SIZE_BYTE (1024 * MB)
+#define _PERF_L1_NUM_WAYS 4
+#define _PERF_L1_NUM_SETS 256
+#define _PERF_L2_NUM_WAYS 8
+#define _PERF_L2_NUM_SLICES 4
+#define _PERF_L2_NUM_SETS 512
+
+#define _PERF_ADDR_STRIDE_L1_SAME_BANK _PERF_CACHELINE_SIZE_BYTE
+#define _PERF_ADDR_STRIDE_L1_SAME_SET (_PERF_L1_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
+#define _PERF_ADDR_STRIDE_L2_SAME_SLICE (_PERF_L2_NUM_SLICES * _PERF_CACHELINE_SIZE_BYTE)
+#define _PERF_ADDR_STRIDE_L2_SAME_SET (_PERF_L2_NUM_SLICES * _PERF_L2_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
+#define _PERF_ADDR_STRIDE_NEXT_PAGE (_PERF_PAGE_SIZE_BYTE)
 
 // probe const
 #define _PERF_BLACKHOLE _PERF_TEST_ADDR_BASE
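These stride constants turn cache geometry into address deltas: two addresses `_PERF_ADDR_STRIDE_L1_SAME_SET` apart share the same L1 set index, so touching more than `_PERF_L1_NUM_WAYS` such lines must evict. A small sketch of how a same-set address walk follows from the macros (illustrative helpers, not part of the probe):

```c
#include "maprobe.h"

// The i-th address mapping to the same L1 set as base:
// base + i * (num_sets * cacheline) leaves the set index bits unchanged.
static inline uint64_t l1_same_set_addr(uint64_t base, int i) {
    return base + (uint64_t)i * _PERF_ADDR_STRIDE_L1_SAME_SET;
}

// With 4 ways, the 5th distinct line in this loop cannot fit in the set,
// so each full pass beyond n = 4 forces at least one replacement.
static void touch_same_set_lines(int n) {
    for (int i = 0; i < n; i++) {
        *(volatile uint64_t *)l1_same_set_addr(_PERF_TEST_ADDR_BASE, i);
    }
}
```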
@@ -29,128 +47,51 @@
 
 struct perf {
     // const to be calibrated at run time
-    uint64_t csr_read_cycle; //# of cycles to read mcycle
+    uint64_t csr_read_cycle; // # of cycles to read mcycle
     uint64_t csr_read_ninst; // # of inst needed to read minstret
     // timer
     uint64_t cycle;
     uint64_t instrcnt;
-} perf;
-
-void _perf_start_timer()
-{
-    perf.cycle = csr_read(CSR_MCYCLE);
-    perf.instrcnt = csr_read(CSR_MINSTRET);
-}
-
-void _perf_end_timer()
-{
-    perf.cycle = csr_read(CSR_MCYCLE) - perf.cycle;
-    perf.instrcnt = csr_read(CSR_MINSTRET) - perf.instrcnt;
-}
-
-void _perf_print_timer()
-{
-    printf("cycle %d inst %d ipc %lf\n", perf.cycle, perf.instrcnt, (float)perf.instrcnt/perf.cycle);
-}
-
-void _perf_calibrate()
-{
-    // csr read delay
-    uint64_t cycle_1 = csr_read(CSR_MCYCLE);
-    uint64_t cycle_2 = csr_read(CSR_MCYCLE);
-    perf.csr_read_cycle = cycle_2-cycle_1;
-    printf("perf_calibrate: csr_read_cycle %d\n", perf.csr_read_cycle);
-
-    // csr read inst cost
-    uint64_t inst_1 = csr_read(CSR_MINSTRET);
-    uint64_t inst_2 = csr_read(CSR_MINSTRET);
-    perf.csr_read_ninst = inst_2-inst_1;
-    printf("perf_calibrate: csr_read_ninst %d\n", perf.csr_read_ninst);
-}
-
-void _perf_blackhole(uint64_t value)
-{
-    *(uint64_t*) _PERF_BLACKHOLE = value;
-}
-
-uint64_t setup_latency_test_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step)
-{
-    uint64_t num_valid_node = 0;
-    assert(step % 8 == 0);
-    assert(step >= 8);
-    for (uint64_t cur_addr = base_addr; cur_addr < end_addr;) {
-        uint64_t next_addr = cur_addr + step;
-        *((uint64_t*)cur_addr) = next_addr;
-        cur_addr = next_addr;
-        num_valid_node++;
-    }
-    return num_valid_node;
-}
-
-uint64_t read_latency_test_linklist(uint64_t base_addr, uint64_t num_valid_node)
-{
-    uint64_t cur_addr = base_addr;
-    for (int i = 0; i < num_valid_node; i++) {
-        cur_addr = (*(uint64_t*)cur_addr);
-    }
-    return cur_addr;
-}
-
-void warmup(uint64_t base_addr, uint64_t end_addr)
-{
-    setup_latency_test_linklist(base_addr, end_addr, _PERF_CACHELINE_SIZE_BYTE);
-}
-
-void test_latency(uint64_t size, int iter)
-{
-    volatile uint64_t result = 0; // make sure compiler will not opt read_latency_test_linklist
-    printf("range 0x%xB (%d iters) latency test\n", size, iter);
-    _perf_start_timer();
-    uint64_t nnode = setup_latency_test_linklist(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size, _PERF_CACHELINE_SIZE_BYTE);
-    _perf_end_timer();
-    uint64_t total_node = nnode * iter;
-    // _perf_print_timer();
-
-    _perf_start_timer();
-    for (int i = 0; i < iter; i++) {
-        result += read_latency_test_linklist(_PERF_TEST_ADDR_BASE, nnode);
-    }
-    _perf_end_timer();
-    // _perf_print_timer();
-    printf("range 0x%xB (%d intrs) read latency %f (%d samples)\n", size, iter, (float)perf.cycle / total_node, total_node);
-
-    _perf_blackhole(result);
-}
-
-void test_mem_throughput(uint64_t iter)
-{
-    uint64_t remain = iter;
-    uint64_t result = 0;
-    uint64_t access_addr = _PERF_TEST_ADDR_BASE;
-    _perf_start_timer();
-    while (remain--) {
-        result += *(uint64_t*) access_addr;
-        access_addr += _PERF_CACHELINE_SIZE_BYTE;
-    }
-    _perf_end_timer();
-    *(uint64_t*) _PERF_BLACKHOLE = result;
-    printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter);
-}
-
-void test_mem_throughput_same_set(uint64_t iter)
-{
-    uint64_t remain = iter;
-    uint64_t result = 0;
-    uint64_t access_addr = _PERF_TEST_ADDR_BASE;
-    _perf_start_timer();
-    while (remain--) {
-        result += *(uint64_t*) access_addr;
-        access_addr += _PERF_SET_SIZE_BYTE;
-    }
-    _perf_end_timer();
-    *(uint64_t*) _PERF_BLACKHOLE = result;
-    printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter);
-}
+};
+extern struct perf perf;
+
+extern uint64_t _perf_g_total_samples;
+
+// common perf tools
+extern void _perf_start_timer();
+extern void _perf_end_timer();
+extern void _perf_print_timer();
+extern void _perf_calibrate();
+extern void _perf_blackhole(uint64_t value);
+
+// latency test
+extern uint64_t setup_pointer_tracing_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step);
+extern uint64_t read_pointer_tracing_linklist(uint64_t base_addr, uint64_t num_valid_node);
+extern void latency_test_warmup(uint64_t base_addr, uint64_t end_addr);
+extern float test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv);
+extern float test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv);
+extern float test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter, int to_csv);
+extern float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv);
+extern float test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv);
+extern float test_same_address_load_latency(int iter, int to_csv);
+extern float test_read_after_write_latency(int iter, int to_csv);
+extern float test_linear_write_latency(uint64_t size, uint64_t step, int iter, int to_csv);
+
+// bandwidth test
+extern float test_l1_load_bandwidth(uint64_t size, int iter, int to_csv);
+extern float test_l1_store_bandwidth(uint64_t size, int iter, int to_csv);
+extern float test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv);
+
+// key parameter matrix generation
+void generate_linear_access_latency_matrix(uint64_t step);
+void generate_pointer_tracing_latency_matrix(uint64_t step);
+void generate_random_access_latency_matrix();
+void generate_replacement_test_matrix();
+
+// legacy test
+extern void legacy_test_mem_throughput(uint64_t iter);
+extern void legacy_test_mem_throughput_same_set(uint64_t iter);
 
 #endif
\ No newline at end of file
diff --git a/apps/maprobe/include/resultmat.h b/apps/maprobe/include/resultmat.h
new file mode 100644
index 0000000000000000000000000000000000000000..629aa49b469a23e85d8a1663a9406c81f9b55462
--- /dev/null
+++ b/apps/maprobe/include/resultmat.h
@@ -0,0 +1,37 @@
+#ifndef PROBE_RESULT_MATRIX_H
+#define PROBE_RESULT_MATRIX_H
+
+#include <klib.h>
+
+struct result_matrix_meta {
+    char* name;
+    char* row_name;
+    char* column_name;
+    int row_size;
+    int column_size;
+    void* result_array;
+    void* column_array;
+    void* row_array;
+};
+
+void print_float_result_matrix(struct result_matrix_meta* meta);
+void matrix_print_example();
+
+#define FOR(v,end) for (int v = 0; v < end; v++)
+#define CONCAT(a,b) a##b
+#define TOSTR(a) #a
+#define DEFINE_FLOAT_RESULT_MATRIX(matrix_name, rowname, rowsize, columnname, columnsize) \
+    struct result_matrix_meta CONCAT(matrix_name,_matrix_meta); \
+    float CONCAT(matrix_name,_result_array)[rowsize][columnsize] = {0}; \
+    int CONCAT(matrix_name,_column_array)[columnsize] = {0}; \
+    int CONCAT(matrix_name,_row_array)[rowsize] = {0}; \
+    CONCAT(matrix_name,_matrix_meta).name = TOSTR(matrix_name); \
+    CONCAT(matrix_name,_matrix_meta).column_name = TOSTR(columnname); \
+    CONCAT(matrix_name,_matrix_meta).row_name = TOSTR(rowname); \
+    CONCAT(matrix_name,_matrix_meta).column_size = columnsize; \
+    CONCAT(matrix_name,_matrix_meta).row_size = rowsize; \
+    CONCAT(matrix_name,_matrix_meta).result_array = CONCAT(matrix_name,_result_array); \
+    CONCAT(matrix_name,_matrix_meta).column_array = CONCAT(matrix_name,_column_array); \
+    CONCAT(matrix_name,_matrix_meta).row_array = CONCAT(matrix_name,_row_array);
+
+#endif
\ No newline at end of file
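One detail worth flagging: `DEFINE_FLOAT_RESULT_MATRIX` mixes declarations with assignment statements, so each expansion must sit inside a function body (C99 or later), before any use of the generated names. A minimal usage sketch with hypothetical dimensions (`demo`, `size_kb`, `iter` are placeholder names):

```c
void demo_matrix_usage(void) {
    // 2 row labels x 3 column labels; all "demo_*" names below are
    // generated from the first macro argument
    DEFINE_FLOAT_RESULT_MATRIX(demo, size_kb, 2, iter, 3);
    FOR(r, 2) { demo_row_array[r] = r; }
    FOR(c, 3) { demo_column_array[c] = c; }
    demo_result_array[1][2] = 3.14f;          // store one measurement
    print_float_result_matrix(&demo_matrix_meta);
}
```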
diff --git a/apps/maprobe/latency-test.c b/apps/maprobe/latency-test.c
new file mode 100644
index 0000000000000000000000000000000000000000..f72cd949ef8e0f4aa350201759cd97a536c6d27a
--- /dev/null
+++ b/apps/maprobe/latency-test.c
@@ -0,0 +1,460 @@
+#include "maprobe.h"
+
+static inline uint64_t generate_rand_address(uint64_t base_addr, uint64_t end_addr, uint64_t align) { // static: a plain C99 "inline" emits no external definition and can fail to link
+    return (rand() % (end_addr - base_addr) + base_addr) / align * align;
+}
+
+void generate_rand_address_array(uint64_t* dest, uint64_t base_addr, uint64_t end_addr, uint64_t align, int number) {
+    for (int i = 0; i < number; i++) {
+        *(dest + i) = generate_rand_address(base_addr, end_addr, align);
+    }
+}
+
+uint64_t generate_pointer_tracing_address(uint64_t base_addr, uint64_t end_addr, uint64_t step) {
+    return setup_pointer_tracing_linklist(base_addr, end_addr, step);
+}
+
+uint64_t setup_pointer_tracing_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step)
+{
+    uint64_t num_valid_node = 0;
+    assert(step % 8 == 0);
+    assert(step >= 8);
+    for (uint64_t cur_addr = base_addr; cur_addr < end_addr;) {
+        uint64_t next_addr = cur_addr + step;
+        *((uint64_t*)cur_addr) = next_addr;
+        cur_addr = next_addr;
+        num_valid_node++;
+    }
+    return num_valid_node;
+}
+
+uint64_t read_pointer_tracing_linklist(uint64_t base_addr, uint64_t num_valid_node)
+{
+    uint64_t cur_addr = base_addr;
+    for (int i = 0; i < num_valid_node; i++) {
+        cur_addr = (*(uint64_t*)cur_addr);
+    }
+    return cur_addr;
+}
+
+void latency_test_warmup(uint64_t base_addr, uint64_t end_addr)
+{
+    setup_pointer_tracing_linklist(base_addr, end_addr, _PERF_CACHELINE_SIZE_BYTE);
+}
+
+float test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv)
+{
+    // printf("pointer tracing latency test\n");
+    // printf("range (B), read latency, iters, samples, cycles\n");
+    register uint64_t result = 0; // make sure the compiler does not optimize away read_pointer_tracing_linklist
+    _perf_start_timer();
+    uint64_t nnode = setup_pointer_tracing_linklist(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size, step);
+    _perf_end_timer();
+    uint64_t total_node = nnode * iter;
+    // _perf_print_timer();
+
+    _perf_start_timer();
+    for (int i = 0; i < iter; i++) {
+        result += read_pointer_tracing_linklist(_PERF_TEST_ADDR_BASE, nnode);
+    }
+    _perf_end_timer();
+    // _perf_print_timer();
+    float acpa = (float)perf.cycle / total_node; // average cycle per access
+    if (to_csv) {
+        printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_node, perf.cycle);
+    } else {
+        printf("range %ldKB (%d iters) pointer tracing read latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n",
+            size/KB, iter, acpa, total_node * 8 * BYTE / (float)perf.cycle, total_node, perf.cycle
+        );
+    }
+    _perf_g_total_samples += total_node;
+    _perf_blackhole(result);
+    return acpa;
+}
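Each node's next pointer is only known after the previous load completes, so the chain serializes the loads and cycles-per-node approximates raw load-to-use latency rather than bandwidth. Schematically, for a three-node list (hypothetical `base`/`step` values; this is just what setup + read do):

```c
// after setup_pointer_tracing_linklist(base, base + 3*step, step):
//   *(uint64_t*)(base)          == base + step
//   *(uint64_t*)(base + step)   == base + 2*step
//   *(uint64_t*)(base + 2*step) == base + 3*step
// read_pointer_tracing_linklist(base, 3) then walks:
uint64_t p = base;
p = *(uint64_t*)p;  // each load's address is the previous load's data,
p = *(uint64_t*)p;  // so consecutive node accesses cannot overlap
p = *(uint64_t*)p;  // in the pipeline
```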
+
+float test_same_address_load_latency(int iter, int to_csv)
+{
+    // printf("same address load latency test\n");
+    // printf("range (B), read latency, iters, samples, cycles\n");
+    register uint64_t result = 0;
+    // _perf_print_timer();
+
+    _perf_start_timer();
+    uint64_t address = _PERF_TEST_ADDR_BASE;
+    for (int i = 0; i < iter; i++) {
+        result += *((volatile uint64_t*) (address));
+    }
+    _perf_end_timer();
+    // _perf_print_timer();
+    uint64_t total_access = iter;
+    float acpa = (float)perf.cycle / total_access; // average cycle per access
+    if (to_csv) {
+        printf("%ld, %f, %d, %ld, %ld\n", 0L, acpa, iter, total_access, perf.cycle);
+    } else {
+        printf("same address read latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n",
+            acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle
+        );
+    }
+    _perf_g_total_samples += total_access;
+    _perf_blackhole(result);
+    return acpa;
+}
+
+float test_read_after_write_latency(int iter, int to_csv)
+{
+    // printf("same address store-load latency test\n");
+    // printf("range (B), read latency, iters, samples, cycles\n");
+    volatile uint64_t result = 0; // volatile: each iteration stores result, and the next iteration's add reloads it, forming the store-to-load dependency under test
+    // _perf_print_timer();
+
+    _perf_start_timer();
+    for (int i = 0; i < iter; i++) {
+        result += *((uint64_t*) _PERF_TEST_ADDR_BASE);
+    }
+    _perf_end_timer();
+    // _perf_print_timer();
+    uint64_t total_access = iter;
+    float acpa = (float)perf.cycle / total_access; // average cycle per access
+    if (to_csv) {
+        printf("%ld, %f, %d, %ld, %ld\n", 0L, acpa, iter, total_access, perf.cycle);
+    } else {
+        printf("read after write latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n",
+            acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle
+        );
+    }
+    _perf_g_total_samples += total_access;
+    _perf_blackhole(result);
+    return acpa;
+}
+
+float test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter, int to_csv)
+{
+    // printf("stride %ld linear access latency test\n", step);
+    // printf("range (B), read latency, iters, samples, cycles\n");
+    register uint64_t result = 0;
+    uint64_t num_access = size / step;
+    // _perf_print_timer();
+
+    _perf_start_timer();
+    for (int i = 0; i < iter; i++) {
+        uint64_t address = _PERF_TEST_ADDR_BASE;
+        for (int j = 0; j < num_access; j++) {
+            result += *((volatile uint64_t*) (address));
+            address += step;
+        }
+    }
+    _perf_end_timer();
+    // _perf_print_timer();
+    uint64_t total_access = num_access * iter;
+    float acpa = (float)perf.cycle / total_access; // average cycle per access
+    if (to_csv) {
+        printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_access, perf.cycle);
+    } else {
+        printf("range %ldKB (%d iters) simple linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %ldB\n",
+            size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step
+        );
+    }
+    _perf_g_total_samples += total_access;
+    _perf_blackhole(result);
+    return acpa;
+}
+
+float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv)
+{
+    // printf("stride %ld linear access latency test\n", step);
+    // printf("range (B), read latency, iters, samples, cycles\n");
+    uint64_t num_access = size / step;
+    num_access += num_access % 8 ?
8 - num_access % 8 : 0; + assert(num_access >= 8); + // prepare access offset + register uint64_t address_offset_1 = step * 1; + register uint64_t address_offset_2 = step * 2; + register uint64_t address_offset_3 = step * 3; + register uint64_t address_offset_4 = step * 4; + register uint64_t address_offset_5 = step * 5; + register uint64_t address_offset_6 = step * 6; + register uint64_t address_offset_7 = step * 7; + register uint64_t address_offset_8 = step * 8; + + // _perf_print_timer(); + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + uint64_t address = _PERF_TEST_ADDR_BASE; + for (int j = 0; j < num_access; j += 8) { + __asm__ volatile ( + "mv a1, %[addr]\n" + "add a2, %[addr], %[offset1]\n" + "add a3, %[addr], %[offset2]\n" + "add a4, %[addr], %[offset3]\n" + "add a5, %[addr], %[offset4]\n" + "add t0, %[addr], %[offset5]\n" + "add t1, %[addr], %[offset6]\n" + "add t2, %[addr], %[offset7]\n" + "ld a0, 0(a1)\n" + "ld a0, 0(a2)\n" + "ld a0, 0(a3)\n" + "ld a0, 0(a4)\n" + "ld a0, 0(a5)\n" + "ld a0, 0(t0)\n" + "ld a0, 0(t1)\n" + "ld a0, 0(t2)\n" + :: + [offset1] "r"(address_offset_1), + [offset2] "r"(address_offset_2), + [offset3] "r"(address_offset_3), + [offset4] "r"(address_offset_4), + [offset5] "r"(address_offset_5), + [offset6] "r"(address_offset_6), + [offset7] "r"(address_offset_7), + [addr] "r"(address) + : "a0", "a1", "a2", "a3", "a4", "a5", "t0", "t1", "t2", "t3" + ); + address += address_offset_8; + // register uint64_t access_addr_0 = address + address_offset_0; + // register uint64_t access_addr_1 = address + address_offset_1; + // register uint64_t access_addr_2 = address + address_offset_2; + // register uint64_t access_addr_3 = address + address_offset_3; + // register uint64_t access_addr_4 = address + address_offset_4; + // register uint64_t access_addr_5 = address + address_offset_5; + // register uint64_t access_addr_6 = address + address_offset_6; + // register uint64_t access_addr_7 = address + address_offset_7; + // address += address_offset_8; + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_0) : "a0"); + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_1) : "a0"); + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_2) : "a0"); + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_3) : "a0"); + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_4) : "a0"); + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_5) : "a0"); + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_6) : "a0"); + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_7) : "a0"); + } + } + _perf_end_timer(); + // _perf_print_timer(); + uint64_t total_access = num_access * iter; + float acpa = (float)perf.cycle / total_access; // average cycle per access + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_access, perf.cycle); + } else { + printf("range %ldKB (%d iters) batch(8) linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", + size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step + ); + } + _perf_g_total_samples += total_access; + return acpa; +} + +float test_linear_write_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv) +{ + // printf("stride %d linear access latency test\n", step); + // printf("range (B), read latency, iters, samples, cycles\n"); + uint64_t num_access = size / step; + num_access += num_access % 8 
? 8 - num_access % 8 : 0; + assert(num_access >= 8); + // prepare access offset + uint64_t address_offset_0 = 0; + register uint64_t address_offset_1 = step * 1; + register uint64_t address_offset_2 = step * 2; + register uint64_t address_offset_3 = step * 3; + register uint64_t address_offset_4 = step * 4; + register uint64_t address_offset_5 = step * 5; + register uint64_t address_offset_6 = step * 6; + register uint64_t address_offset_7 = step * 7; + register uint64_t address_offset_8 = step * 8; + + // _perf_print_timer(); + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + uint64_t address = _PERF_TEST_ADDR_BASE; + for (int j = 0; j < num_access; j += 8) { + register uint64_t access_addr_0 = address + address_offset_0; + register uint64_t access_addr_1 = address + address_offset_1; + register uint64_t access_addr_2 = address + address_offset_2; + register uint64_t access_addr_3 = address + address_offset_3; + register uint64_t access_addr_4 = address + address_offset_4; + register uint64_t access_addr_5 = address + address_offset_5; + register uint64_t access_addr_6 = address + address_offset_6; + register uint64_t access_addr_7 = address + address_offset_7; + address += address_offset_8; + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_0) : "a0"); + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_1) : "a0"); + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_2) : "a0"); + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_3) : "a0"); + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_4) : "a0"); + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_5) : "a0"); + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_6) : "a0"); + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_7) : "a0"); + } + } + _perf_end_timer(); + // _perf_print_timer(); + uint64_t total_access = num_access * iter; + float acpa = (float)perf.cycle / total_access; // average cycle per access + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_access, perf.cycle); + } else { + printf("range %ldKB (%d iters) batch(8) linear write latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", + size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step + ); + } + _perf_g_total_samples += total_access; + return acpa; +} + +float test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv) +{ + return test_linear_access_latency_batch8(size, step, iter, to_csv); +} + +float test_linear_write_latency(uint64_t size, uint64_t step, int iter, int to_csv) +{ + return test_linear_write_latency_batch8(size, step, iter, to_csv); +} + +float test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv) +{ + // printf("align %d random access (cache line) latency test, %s\n", + // test_align, pregen_addr ? 
"use pregen addr array" : "gen rand addr at run time" + // ); + // printf("range (B), read latency, iters, samples, cycles\n"); + register uint64_t result = 0; + // _perf_print_timer(); + + uint64_t total_access = num_access * iter; + if (test_range > total_access*8*_PERF_CACHELINE_SIZE_BYTE) { + printf("total access size %ldKB less than test range %ldKB, ignored\n", + total_access*8*_PERF_CACHELINE_SIZE_BYTE/KB, + test_range/KB + ); + return 0; + } + + // alloc memory for random access addr array and data + assert(test_align >= 8 * BYTE); + // assert(size >= test_align); + // uint64_t num_access = size / test_align; + if (pregen_addr) { + uint64_t test_array_base_addr = _PERF_TEST_ADDR_BASE + num_access * sizeof(uint64_t*); + uint64_t address_array_base_addr = _PERF_TEST_ADDR_BASE; + generate_rand_address_array((uint64_t*)address_array_base_addr, test_array_base_addr, test_array_base_addr + test_range, test_align, num_access); + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + for (int j = 0; j < num_access; j++) { + result += *((uint64_t*) (address_array_base_addr + j * sizeof(uint64_t*))); + } + } + _perf_end_timer(); + } else { + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + for (int j = 0; j < num_access; j++) { + result += *((uint64_t*) (generate_rand_address(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + test_range, test_align))); + } + } + _perf_end_timer(); + } + // _perf_print_timer(); + float acpa = (float)perf.cycle / total_access; // average cycle per access + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", test_range, acpa, iter, total_access, perf.cycle); + } else { + printf("range %ldKB, access %ldKB (cover %ldKB) (%d iters) random read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), align %ldB, %s\n", + test_range/KB, total_access*8*BYTE/KB, total_access*8*_PERF_CACHELINE_SIZE_BYTE/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, test_align, + pregen_addr ? 
"pregen addr" : "runtime addr" + ); + } + _perf_g_total_samples += total_access; + _perf_blackhole(result); + return acpa; +} + +void legacy_test_mem_throughput(uint64_t iter) +{ + uint64_t remain = iter; + uint64_t result = 0; + uint64_t access_addr = _PERF_TEST_ADDR_BASE; + _perf_start_timer(); + while (remain--) { + result += *(uint64_t*) access_addr; + access_addr += _PERF_CACHELINE_SIZE_BYTE; + } + _perf_end_timer(); + *(uint64_t*) _PERF_BLACKHOLE = result; + printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter); +} + +void legacy_test_mem_throughput_same_set(uint64_t iter) +{ + uint64_t remain = iter; + uint64_t result = 0; + uint64_t access_addr = _PERF_TEST_ADDR_BASE; + _perf_start_timer(); + while (remain--) { + result += *(uint64_t*) access_addr; + access_addr += _PERF_ADDR_STRIDE_L1_SAME_SET; + } + _perf_end_timer(); + *(uint64_t*) _PERF_BLACKHOLE = result; + printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter); +} + +void generate_linear_access_latency_matrix(uint64_t step) +{ + // step can be _PERF_CACHELINE_SIZE_BYTE or 8*BYTE +#define LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB 14 + // LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB 14: 14 cases in total, from 1KB to 8MB + DEFINE_FLOAT_RESULT_MATRIX(linear_access_latency,size_kb_pow2,LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB,iter,3); + FOR(x,LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB) { linear_access_latency_row_array[x] = x; } + FOR(x,3) { linear_access_latency_column_array[x] = x; } + for (int i = 0; i < LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB; i++) { + int warm_up_iter = i < 6 ? 4 : 1; + int test_iter = i < 6 ? 4 : 2; + linear_access_latency_result_array[i][0] = test_linear_access_latency((1< +#include "maprobe.h" + +void typical_linear_load_test_set() +{ + _perf_calibrate(); + printf("------------- linear load test set -------------\n"); + printf("page size linear double word load:\n"); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 1, 0); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 2, 0); + printf("page size linear cache line load:\n"); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + printf("dcache/2 linear double word load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, sizeof(uint64_t), 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, sizeof(uint64_t), 2, 0); + printf("dcache/2 linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + printf("dcache linear double word load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, sizeof(uint64_t), 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, sizeof(uint64_t), 2, 0); + printf("dcache linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + printf("L2 linear cache line load:\n"); + test_linear_access_latency(_PERF_L2_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 1, 0); + test_linear_access_latency(_PERF_L2_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + printf("L1 (L1 same set) linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 10, 0); + 
test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 100, 0); + printf("L2 (L1 same set) linear cache line load:\n"); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 2, 0); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 4, 0); + printf("L1 (L2 same slice) linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 2, 0); + printf("L2 (L2 same slice) linear cache line load:\n"); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 1, 0); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 2, 0); + printf("L1 (page traverse) linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 10, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 100, 0); + printf("L2 (page traverse) linear cache line load:\n"); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 2, 0); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 4, 0); + printf("total samples: %ld\n", _perf_g_total_samples); +} + +void typical_random_load_test_set() +{ + printf("------------- random load test set -------------\n"); + printf("from page size random load (word):\n"); + test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, 8*BYTE, 1, 1, 0); + test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, 8*BYTE, 0, 1, 0); + printf("from page size random load (cache line):\n"); + test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + printf("from dcache/2 size random load (word):\n"); + test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, 8*BYTE, 1, 1, 0); + test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, 8*BYTE, 0, 1, 0); + printf("from dcache/2 size random load (cache line):\n"); + test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + printf("from dcache size random load (word):\n"); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, 8*BYTE, 1, 1, 0); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, 8*BYTE, 0, 1, 0); + printf("from dcache size random load (cache line):\n"); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + printf("from l2 size random load (word):\n"); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, 8*BYTE, 1, 1, 0); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, 8*BYTE, 0, 1, 0); + printf("from l2 size random load (cache line):\n"); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + printf("total samples: %ld\n", 
_perf_g_total_samples);
+}
+
+void typical_pointer_tracing_load_test_set()
+{
+    printf("------------- pointer tracing load test set -------------\n");
+    printf("double word by double word tracing:\n");
+    test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, 8*BYTE, 10, 0);
+    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, 8*BYTE, 2, 0);
+    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, 8*BYTE, 2, 0);
+    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, 8*BYTE, 2, 0);
+    printf("cacheline by cacheline tracing:\n");
+    test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 10, 0);
+    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
+    test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
+    printf("page by page tracing:\n");
+    test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE*2, _PERF_PAGE_SIZE_BYTE, 10, 0);
+    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, _PERF_PAGE_SIZE_BYTE, 10, 0);
+    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_PAGE_SIZE_BYTE, 10, 0);
+    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_PAGE_SIZE_BYTE, 10, 0);
+    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_PAGE_SIZE_BYTE, 10, 0);
+    printf("total samples: %ld\n", _perf_g_total_samples);
+}
+
+void typical_memory_disambiguation_test_set()
+{
+    printf("------------- memory disambiguation test set -------------\n");
+    printf("load from the same address:\n");
+    test_same_address_load_latency(1024, 0);
+    test_same_address_load_latency(1024, 0);
+    test_same_address_load_latency(1024, 0);
+    printf("load then store to the same address:\n");
+    test_read_after_write_latency(1024, 0);
+    test_read_after_write_latency(1024, 0);
+    test_read_after_write_latency(1024, 0);
+    // more to be added
+}
+
+void typical_l1_access_test_set()
+{
+    printf("------------- typical dcache access pattern test set -------------\n");
+    printf("ideal load bandwidth:\n");
+    test_l1_load_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
+    test_l1_load_bandwidth(_PERF_L1_SIZE_BYTE, 10, 0);
+    printf("ideal store bandwidth:\n");
+    test_l1_store_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
+    test_l1_store_bandwidth(_PERF_L1_SIZE_BYTE, 10, 0);
+    printf("ideal write combine buffer bandwidth:\n");
+    test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
+    test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, 5, 0);
+    printf("replacement error penalty:\n");
+    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
+    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
+}
+
+// typical latency test for fast regression
+void typical_latency_test()
+{
+    _perf_g_total_samples = 0;
+    typical_l1_access_test_set();
+    typical_linear_load_test_set();
+    typical_random_load_test_set();
+    typical_pointer_tracing_load_test_set();
+    typical_memory_disambiguation_test_set();
+}
+
+void pointer_tracing_graph()
+{
+    _perf_g_total_samples = 0;
+    _perf_calibrate();
+    printf("data for pointer tracing latency graph:\n");
+    printf("range (B), read latency, iters, samples\n");
+    for (int i = 1*KB; i < 64*KB; i = i + 1*KB) {
+        test_pointer_tracing_latency(i, _PERF_CACHELINE_SIZE_BYTE, 2, 1);
+    }
+    for (int i = 64*KB; i < 1024*KB; i = i + 64*KB) {
+
test_pointer_tracing_latency(i, _PERF_CACHELINE_SIZE_BYTE, 1, 1); + } + test_pointer_tracing_latency(1024*KB, _PERF_CACHELINE_SIZE_BYTE, 1, 1); + for (int i = 1*MB; i <8*MB; i = i + 1*MB) { + test_pointer_tracing_latency(i, _PERF_CACHELINE_SIZE_BYTE, 1, 1); + } + printf("total samples: %ld\n", _perf_g_total_samples); +} + +// a simple test set used to check if test is working correctly +void latency_test_example() +{ + _perf_calibrate(); + printf("latency test example:\n"); + test_l1_load_bandwidth(4*KB, 5, 0); + test_l1_load_bandwidth(4*KB, 5, 0); + test_l1_store_bandwidth(4*KB, 5, 0); + test_l1_store_bandwidth(4*KB, 5, 0); + test_l1_store_wcb_bandwidth(8*KB, 5, 0); + test_l1_store_wcb_bandwidth(8*KB, 5, 0); + test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*4,_PERF_ADDR_STRIDE_L1_SAME_SET,8,0); + test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_same_address_load_latency(1024, 0); + test_read_after_write_latency(1024, 0); + printf("total samples: %ld\n", _perf_g_total_samples); +} + +void l2_l3_pressure_test() +{ + _perf_calibrate(); + printf("L2 and L3 same set pressure test:\n"); + for (int i = 1; i < 16; i++) { + printf("ways accessed: %d\n", i); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0); + } + for (int i = 16; i <= 512; i*=2) { + printf("ways accessed: %d\n", i); + // jump at i = 32 + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0); + } + + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*64,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*64,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + // jump at i = 128 + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*128,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*128,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*256,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*256,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*512,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*512,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); +} + +void legacy_latency_throughput_test() +{ + _perf_calibrate(); + printf("Memory throughput:\n"); + legacy_test_mem_throughput(1024); + printf("L1 latency:\n"); + test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0); + test_pointer_tracing_latency(_PERF_L1_NOALIAS_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, 
_PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    printf("L2 latency:\n");
+    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    // test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    printf("L3 latency:\n");
+    test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    // test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE,2, 0);
+    // printf("MEM:\n");
+    // test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE*2, _PERF_CACHELINE_SIZE_BYTE,2, 0);
+    printf("total samples: %ld\n", _perf_g_total_samples);
+}
+
+int main()
+{
+    latency_test_example();
+
+    generate_linear_access_latency_matrix(8*BYTE);
+    generate_linear_access_latency_matrix(_PERF_CACHELINE_SIZE_BYTE);
+    generate_pointer_tracing_latency_matrix(8*BYTE);
+    generate_pointer_tracing_latency_matrix(_PERF_CACHELINE_SIZE_BYTE);
+    // generate_random_access_latency_matrix();
+    generate_replacement_test_matrix();
+
+    // matrix_print_example();
+    typical_latency_test();
+    // pointer_tracing_graph();
+    // latency_test();
+    // legacy_latency_throughput_test();
+    l2_l3_pressure_test();
+    return 0;
+}
\ No newline at end of file
diff --git a/apps/maprobe/maprobe.c b/apps/maprobe/maprobe.c
deleted file mode 100644
index 16563d7836a93cd0199d70be6cf5225b960acdac..0000000000000000000000000000000000000000
--- a/apps/maprobe/maprobe.c
+++ /dev/null
@@ -1,24 +0,0 @@
-#include <klib.h>
-#include "maprobe.h"
-
-int main()
-{
-    _perf_calibrate();
-    printf("Memory throughput:\n");
-    test_mem_throughput(512);
-    printf("L1 latency:\n");
-    test_latency(4 * KB, 5);
-    test_latency(_PERF_L1_NOALIAS_SIZE_BYTE, 2);
-    test_latency(_PERF_L1_SIZE_BYTE/2, 2);
-    test_latency(_PERF_L1_SIZE_BYTE, 2);
-    printf("L2 latency:\n");
-    test_latency(_PERF_L2_SIZE_BYTE/2, 2);
-    // test_latency(_PERF_L2_SIZE_BYTE, 2);
-    printf("L3 latency:\n");
-    test_latency(_PERF_L3_SIZE_BYTE/2, 2);
-    // test_latency(_PERF_L3_SIZE_BYTE,2);
-    // printf("MEM:\n");
-    // test_latency(_PERF_L3_SIZE_BYTE*2,2);
-
-    return 0;
-}
\ No newline at end of file
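Since maprobe.h now guards `_PERF_TEST_ADDR_BASE` with `#ifndef`, the test base address (and the `PERF_SIM` switch) are intended to be set from the build line rather than by editing the header. A hypothetical invocation, assuming the AM build system forwards extra compiler flags (exact variable names depend on the local setup):

```sh
# hypothetical: override the probe's base address and disable CSR reads
make ARCH=riscv64-xs CFLAGS="-D_PERF_TEST_ADDR_BASE=0x2000400000 -DPERF_SIM"
```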
diff --git a/apps/maprobe/replacement-test.c b/apps/maprobe/replacement-test.c
new file mode 100644
index 0000000000000000000000000000000000000000..641bce8bd3dd2e729477fc8bac3562fec12672f2
--- /dev/null
+++ b/apps/maprobe/replacement-test.c
@@ -0,0 +1,20 @@
+#include "maprobe.h"
+
+void generate_replacement_test_matrix()
+{
+#define REPLACEMENT_TEST_MAX_WAY 17 // test 1 to 17 ways, one beyond a 16-way set
+#define REPLACEMENT_TEST_ITER 5 // 1 warmup + 4 test
+    assert(REPLACEMENT_TEST_ITER >= 2);
+    DEFINE_FLOAT_RESULT_MATRIX(replacement_test,num_way_accessed,REPLACEMENT_TEST_MAX_WAY,iter,REPLACEMENT_TEST_ITER);
+    FOR(x,REPLACEMENT_TEST_ITER) { replacement_test_column_array[x] = x; }
+    for (int i = 0; i < REPLACEMENT_TEST_MAX_WAY; i++) {
+        replacement_test_row_array[i] = i+1;
+        int warm_up_iter = 64;
+        int test_iter = i < 4 ? 256 : 64;
+        replacement_test_result_array[i][0] = test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*(i+1),_PERF_ADDR_STRIDE_L1_SAME_SET,warm_up_iter,0); // warmup
+        for(int j = 1; j < REPLACEMENT_TEST_ITER; j++) {
+            replacement_test_result_array[i][j] = test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*(i+1),_PERF_ADDR_STRIDE_L1_SAME_SET,test_iter,0); // test
+        }
+    }
+    print_float_result_matrix(&replacement_test_matrix_meta);
+}
\ No newline at end of file
diff --git a/apps/maprobe/resultmat.c b/apps/maprobe/resultmat.c
new file mode 100644
index 0000000000000000000000000000000000000000..a264f182e472c889fac4d75a461f03d0a5df2551
--- /dev/null
+++ b/apps/maprobe/resultmat.c
@@ -0,0 +1,55 @@
+#include "resultmat.h"
+
+void print_float_result_matrix(struct result_matrix_meta* meta)
+{
+    assert(meta);
+    printf("---------- %s matrix start ----------\n", meta->name);
+    printf("%s (row) \\ %s (column)\n", meta->row_name, meta->column_name);
+    if (meta->column_array) {
+        if (meta->row_array) {
+            printf("\\ , \t");
+        }
+        for (int c = 0; c < meta->column_size; c++) {
+            printf(" %d,\t", *((int*)meta->column_array + c));
+        }
+        printf("\n");
+    }
+    for (int r = 0; r < meta->row_size; r++) {
+        if (meta->row_array) {
+            printf("%3d,\t", *((int*)meta->row_array + r));
+        }
+        for (int c = 0; c < meta->column_size; c++) {
+            printf("%f,\t", *((float*)meta->result_array + r * meta->column_size + c));
+        }
+        printf("\n");
+    }
+    printf("---------- %s matrix end ----------\n", meta->name);
+}
+
+void matrix_print_example()
+{
+    DEFINE_FLOAT_RESULT_MATRIX(test,testrow,5,testcol,10);
+    // the macro above expands to roughly:
+    // struct result_matrix_meta test_matrix_meta;
+    // float test_result_array[5][10] = {0};
+    // int test_column_array[10] = {0};
+    // int test_row_array[5] = {0};
+    // test_matrix_meta.name = "test";
+    // test_matrix_meta.column_name = "testcol";
+    // test_matrix_meta.row_name = "testrow";
+    // test_matrix_meta.column_size = 10;
+    // test_matrix_meta.row_size = 5;
+    // test_matrix_meta.result_array = test_result_array;
+    // test_matrix_meta.column_array = test_column_array;
+    // test_matrix_meta.row_array = test_row_array;
+
+    FOR(x,5) { test_row_array[x] = x; }
+    FOR(x,10) { test_column_array[x] = x; }
+    FOR(x,5) {
+        FOR(y,10) {
+            test_result_array[x][y] = rand();
+        }
+    }
+    print_float_result_matrix(&test_matrix_meta);
+}
\ No newline at end of file
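For reference, the output shape of `print_float_result_matrix` follows directly from the printf calls above. For the hypothetical 2x3 `demo` matrix from the earlier usage sketch (zero-initialized values, labels 0..n):

```
---------- demo matrix start ----------
size_kb (row) \ iter (column)
\ , 	 0,	 1,	 2,
  0,	0.000000,	0.000000,	0.000000,
  1,	0.000000,	0.000000,	0.000000,
---------- demo matrix end ----------
```

The trailing commas and tab separators make the block easy to paste into a spreadsheet or CSV parser, which matches how the `to_csv` mode of the individual tests is intended to be consumed.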