From 421b3b8f17789180419765b28a137c4addb3da0d Mon Sep 17 00:00:00 2001 From: William Wang Date: Mon, 6 Mar 2023 17:20:30 +0800 Subject: [PATCH] maprobe: add linear read, random read, l-l vio test --- apps/maprobe/common.c | 1 + apps/maprobe/include/maprobe.h | 35 +++++-- apps/maprobe/latency-test.c | 156 ++++++++++++++++++++++++++--- apps/maprobe/main.c | 178 +++++++++++++++++++++++++++++++-- 4 files changed, 337 insertions(+), 33 deletions(-) diff --git a/apps/maprobe/common.c b/apps/maprobe/common.c index 9cd55a62..222aafc8 100644 --- a/apps/maprobe/common.c +++ b/apps/maprobe/common.c @@ -1,6 +1,7 @@ #include "maprobe.h" struct perf perf; +uint64_t _perf_g_total_samples = 0; void _perf_start_timer() { diff --git a/apps/maprobe/include/maprobe.h b/apps/maprobe/include/maprobe.h index 27a170da..b17ea0ba 100644 --- a/apps/maprobe/include/maprobe.h +++ b/apps/maprobe/include/maprobe.h @@ -22,12 +22,22 @@ // #define _PERF_TEST_ADDR_BASE 0x2000400000 #endif #define _PERF_CACHELINE_SIZE_BYTE (64 * BYTE) -#define _PERF_L1_NOALIAS_SIZE_BYTE (32 * KB) +#define _PERF_PAGE_SIZE_BYTE (4 * KB) +#define _PERF_L1_NOALIAS_SIZE_BYTE (16 * KB) #define _PERF_L1_SIZE_BYTE (64 * KB) -#define _PERF_L2_SIZE_BYTE (512 * KB) -#define _PERF_L3_SIZE_BYTE (2 * MB) +#define _PERF_L2_SIZE_BYTE (1 * MB) +#define _PERF_L3_SIZE_BYTE (6 * MB) +#define _PERF_MEM_SIZE_BYTE (1024 * MB) #define _PERF_L1_NUM_WAYS 4 -#define _PERF_SET_SIZE_BYTE (_PERF_L1_SIZE_BYTE / _PERF_L1_NUM_WAYS) +#define _PERF_L1_NUM_SETS 256 +#define _PERF_L2_NUM_SLICES 4 +// #define _PERF_L2_NUM_SETS 512 + +#define _PERF_ADDR_STRIDE_L1_SAME_BANK _PERF_CACHELINE_SIZE_BYTE +#define _PERF_ADDR_STRIDE_L1_SAME_SET (_PERF_L1_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE) +#define _PERF_ADDR_STRIDE_L2_SAME_SLICE (_PERF_L2_NUM_SLICES * _PERF_CACHELINE_SIZE_BYTE) +// #define _PERF_ADDR_STRIDE_L2_SAME_SET (_PERF_L2_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE) +#define _PERF_ADDR_STRIDE_NEXT_PAGE (_PERF_PAGE_SIZE_BYTE) // probe const #define _PERF_BLACKHOLE _PERF_TEST_ADDR_BASE @@ -42,9 +52,10 @@ struct perf uint64_t cycle; uint64_t instrcnt; }; - extern struct perf perf; +extern uint64_t _perf_g_total_samples; + // common perf tools extern void _perf_start_timer(); extern void _perf_end_timer(); @@ -53,11 +64,15 @@ extern void _perf_calibrate(); extern void _perf_blackhole(uint64_t value); // latency test -extern uint64_t setup_latency_test_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step); -extern uint64_t read_latency_test_linklist(uint64_t base_addr, uint64_t num_valid_node); +extern uint64_t setup_pointer_tracing_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step); +extern uint64_t read_pointer_tracing_linklist(uint64_t base_addr, uint64_t num_valid_node); extern void latency_test_warmup(uint64_t base_addr, uint64_t end_addr); -extern void test_latency(uint64_t size, int iter); -extern void test_mem_throughput(uint64_t iter); -extern void test_mem_throughput_same_set(uint64_t iter); +extern void test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv); +extern void test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv); +extern void test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv); +extern void test_same_address_load_latency(int iter, int to_csv); + +extern void legacy_test_mem_throughput(uint64_t iter); +extern void legacy_test_mem_throughput_same_set(uint64_t iter); #endif \ No newline at end of file diff --git a/apps/maprobe/latency-test.c b/apps/maprobe/latency-test.c index 570f11e8..ad152388 100644 --- a/apps/maprobe/latency-test.c +++ b/apps/maprobe/latency-test.c @@ -1,6 +1,24 @@ #include "maprobe.h" -uint64_t setup_latency_test_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step) +// inline uint64_t get_next_linear_address(uint64_t current_addr, uint64_t step) { +// return current_addr + step; +// } + +inline uint64_t generate_rand_address(uint64_t base_addr, uint64_t end_addr, uint64_t align) { + return (rand() % (end_addr - base_addr) + base_addr) / align * align; +} + +void generate_rand_address_array(uint64_t* dest, uint64_t base_addr, uint64_t end_addr, uint64_t align, int number) { + for (int i = 0; i < number; i++) { + *(dest + i) = generate_rand_address(base_addr, end_addr, align); + } +} + +uint64_t generate_pointer_tracing_address(uint64_t base_addr, uint64_t end_addr, uint64_t step) { + return setup_pointer_tracing_linklist(base_addr, end_addr, step); +} + +uint64_t setup_pointer_tracing_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step) { uint64_t num_valid_node = 0; assert(step % 8 == 0); @@ -14,7 +32,7 @@ uint64_t setup_latency_test_linklist(uint64_t base_addr, uint64_t end_addr, uint return num_valid_node; } -uint64_t read_latency_test_linklist(uint64_t base_addr, uint64_t num_valid_node) +uint64_t read_pointer_tracing_linklist(uint64_t base_addr, uint64_t num_valid_node) { uint64_t cur_addr = base_addr; for (int i = 0; i < num_valid_node; i++) { @@ -25,31 +43,145 @@ uint64_t read_latency_test_linklist(uint64_t base_addr, uint64_t num_valid_node) void latency_test_warmup(uint64_t base_addr, uint64_t end_addr) { - setup_latency_test_linklist(base_addr, end_addr, _PERF_CACHELINE_SIZE_BYTE); + setup_pointer_tracing_linklist(base_addr, end_addr, _PERF_CACHELINE_SIZE_BYTE); } -void test_latency(uint64_t size, int iter) +void test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv) { - volatile uint64_t result = 0; // make sure compiler will not opt read_latency_test_linklist - printf("range 0x%xB (%d iters) latency test\n", size, iter); + // printf("pointer tracing latency test\n"); + // printf("range (B), read latency, iters, samples, cycles\n"); + volatile uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist _perf_start_timer(); - uint64_t nnode = setup_latency_test_linklist(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size, _PERF_CACHELINE_SIZE_BYTE); + uint64_t nnode = setup_pointer_tracing_linklist(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size, step); _perf_end_timer(); uint64_t total_node = nnode * iter; // _perf_print_timer(); _perf_start_timer(); for (int i = 0; i < iter; i++) { - result += read_latency_test_linklist(_PERF_TEST_ADDR_BASE, nnode); + result += read_pointer_tracing_linklist(_PERF_TEST_ADDR_BASE, nnode); } _perf_end_timer(); // _perf_print_timer(); - printf("range 0x%xB (%d iters) read latency %f (%d samples)\n", size, iter, (float)perf.cycle / total_node, total_node); + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_node, iter, total_node, perf.cycle); + } else { + printf("range %ldKB (%d iters) pointer tracing read latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n", + size/KB, iter, (float)perf.cycle / total_node, total_node * 8 * BYTE / (float)perf.cycle, total_node, perf.cycle + ); + } + + _perf_blackhole(result); + _perf_g_total_samples += total_node; +} + +void test_same_address_load_latency(int iter, int to_csv) +{ + // printf("same address load latency test\n", step); + // printf("range (B), read latency, iters, samples, cycles\n"); + volatile uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist + // _perf_print_timer(); + + _perf_start_timer(); + uint64_t address = _PERF_TEST_ADDR_BASE; + for (int i = 0; i < iter; i++) { + result += *((uint64_t*) (address)); + } + _perf_end_timer(); + // _perf_print_timer(); + uint64_t total_access = iter; + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", 0, (float)perf.cycle / total_access, iter, total_access, perf.cycle); + } else { + printf("same address read latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n", + (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle + ); + } + + _perf_blackhole(result); + _perf_g_total_samples += total_access; +} + +void test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv) +{ + // printf("stride %d linear access latency test\n", step); + // printf("range (B), read latency, iters, samples, cycles\n"); + volatile uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist + uint64_t num_access = size / step; + // _perf_print_timer(); + + _perf_start_timer(); + uint64_t address = _PERF_TEST_ADDR_BASE; + for (int i = 0; i < iter; i++) { + for (int j = 0; j < num_access; j++) { + result += *((uint64_t*) (address)); + address += step; + } + } + _perf_end_timer(); + // _perf_print_timer(); + uint64_t total_access = num_access * iter; + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle); + } else { + printf("range %ldKB (%d iters) linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", + size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step + ); + } + + _perf_blackhole(result); + _perf_g_total_samples += total_access; +} + +void test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv) +{ + // printf("align %d random access (cache line) latency test, %s\n", + // test_align, pregen_addr ? "use pregen addr array" : "gen rand addr at run time" + // ); + // printf("range (B), read latency, iters, samples, cycles\n"); + volatile uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist + // _perf_print_timer(); + + // alloc memory for random access addr array and data + assert(test_align >= 8 * BYTE); + // assert(size >= test_align); + // uint64_t num_access = size / test_align; + if (pregen_addr) { + uint64_t test_array_base_addr = _PERF_TEST_ADDR_BASE + num_access * sizeof(uint64_t*); + uint64_t address_array_base_addr = _PERF_TEST_ADDR_BASE; + generate_rand_address_array((uint64_t*)address_array_base_addr, test_array_base_addr, test_array_base_addr + test_range, test_align, num_access); + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + for (int j = 0; j < num_access; j++) { + result += *((uint64_t*) (address_array_base_addr + j * sizeof(uint64_t*))); + } + } + _perf_end_timer(); + } else { + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + for (int j = 0; j < num_access; j++) { + result += *((uint64_t*) (generate_rand_address(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + test_range, test_align))); + } + } + _perf_end_timer(); + } + // _perf_print_timer(); + uint64_t total_access = num_access * iter; + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", test_range, (float)perf.cycle / total_access, iter, total_access, perf.cycle); + } else { + printf("range %ldKB, access cover %ldKB (%d iters) random read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), align %ldB, %s\n", + test_range/KB, total_access*8*_PERF_CACHELINE_SIZE_BYTE/KB, iter, (float)perf.cycle / (total_access), total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, test_align, + pregen_addr ? "pregen addr" : "runtime addr" + ); + } _perf_blackhole(result); + _perf_g_total_samples += total_access; } -void test_mem_throughput(uint64_t iter) +void legacy_test_mem_throughput(uint64_t iter) { uint64_t remain = iter; uint64_t result = 0; @@ -64,7 +196,7 @@ void test_mem_throughput(uint64_t iter) printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter); } -void test_mem_throughput_same_set(uint64_t iter) +void legacy_test_mem_throughput_same_set(uint64_t iter) { uint64_t remain = iter; uint64_t result = 0; @@ -72,7 +204,7 @@ void test_mem_throughput_same_set(uint64_t iter) _perf_start_timer(); while (remain--) { result += *(uint64_t*) access_addr; - access_addr += _PERF_SET_SIZE_BYTE; + access_addr += _PERF_ADDR_STRIDE_L1_SAME_SET; } _perf_end_timer(); *(uint64_t*) _PERF_BLACKHOLE = result; diff --git a/apps/maprobe/main.c b/apps/maprobe/main.c index 16563d78..9b64ce9a 100644 --- a/apps/maprobe/main.c +++ b/apps/maprobe/main.c @@ -1,24 +1,180 @@ #include #include "maprobe.h" -int main() +void typical_linear_load_test_set() +{ + _perf_calibrate(); + printf("------------- linear load test set -------------\n"); + printf("page size linear double word load:\n"); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 1, 0); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 2, 0); + printf("page size linear cache line load:\n"); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + printf("dcache/2 linear double word load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, sizeof(uint64_t), 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, sizeof(uint64_t), 2, 0); + printf("dcache/2 linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + printf("dcache linear double word load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, sizeof(uint64_t), 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, sizeof(uint64_t), 2, 0); + printf("dcache linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + printf("L2 linear cache line load:\n"); + test_linear_access_latency(_PERF_L2_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 1, 0); + test_linear_access_latency(_PERF_L2_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + printf("L1 (L1 same set) linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 10, 0); + printf("L2 (L1 same set) linear cache line load:\n"); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 1, 0); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 2, 0); + printf("L1 (L2 same slice) linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 2, 0); + printf("L2 (L2 same slice) linear cache line load:\n"); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 1, 0); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 2, 0); + printf("L1 (page traverse) linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 10, 0); + printf("L2 (page traverse) linear cache line load:\n"); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 1, 0); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 2, 0); + printf("total samples: %ld\n", _perf_g_total_samples); +} + +void typical_random_load_test_set() +{ + printf("------------- random load test set -------------\n"); + printf("from page size random load (word):\n"); + test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, 8*BYTE, 1, 1, 0); + test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, 8*BYTE, 0, 1, 0); + printf("from page size random load (cache line):\n"); + test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + printf("from dcache/2 size random load (word):\n"); + test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, 8*BYTE, 1, 1, 0); + test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, 8*BYTE, 0, 1, 0); + printf("from dcache/2 size random load (cache line):\n"); + test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + printf("from dcache size random load (word):\n"); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, 8*BYTE, 1, 1, 0); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, 8*BYTE, 0, 1, 0); + printf("from dcache size random load (cache line):\n"); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + printf("from l2 size random load (word):\n"); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, 8*BYTE, 1, 1, 0); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, 8*BYTE, 0, 1, 0); + printf("from l2 size random load (cache line):\n"); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + printf("total samples: %ld\n", _perf_g_total_samples); +} + +void typical_pointer_tracing_load_test_set() +{ + printf("------------- pointer tracing load test set -------------\n"); + printf("cacheline by cacheline tracing:\n"); + test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 10, 0); + test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0); + test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 1, 0); + printf("page by page, tracing:\n"); + test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE*2, _PERF_PAGE_SIZE_BYTE, 10, 0); + test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, _PERF_PAGE_SIZE_BYTE, 10, 0); + test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_PAGE_SIZE_BYTE, 10, 0); + test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_PAGE_SIZE_BYTE, 10, 0); + test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_PAGE_SIZE_BYTE, 10, 0); + printf("total samples: %ld\n", _perf_g_total_samples); +} + +void typical_memory_disambiuation_test_set() +{ + printf("------------- memory disambiuation test set -------------\n"); + printf("load from the same address:\n"); + test_same_address_load_latency(1024, 0); + test_same_address_load_latency(1024, 0); + test_same_address_load_latency(1024, 0); + // more to be added +} + +// typical latency test for fast regression +void typical_latency_test() +{ + _perf_g_total_samples = 0; + typical_linear_load_test_set(); + typical_random_load_test_set(); + typical_pointer_tracing_load_test_set(); + typical_memory_disambiuation_test_set(); +} + +void pointer_tracing_graph() +{ + _perf_g_total_samples = 0; + _perf_calibrate(); + printf("data for pointer tracing latency graph:\n"); + printf("range (B), read latency, iters, samples\n"); + for (int i = 1*KB; i < 64*KB; i = i + 1*KB) { + test_pointer_tracing_latency(i, _PERF_CACHELINE_SIZE_BYTE, 2, 1); + } + for (int i = 64*KB; i < 1024*KB; i = i + 64*KB) { + test_pointer_tracing_latency(i, _PERF_CACHELINE_SIZE_BYTE, 1, 1); + } + test_pointer_tracing_latency(1024*KB, _PERF_CACHELINE_SIZE_BYTE, 1, 1); + for (int i = 1*MB; i <8*MB; i = i + 1*MB) { + test_pointer_tracing_latency(i, _PERF_CACHELINE_SIZE_BYTE, 1, 1); + } + printf("total samples: %ld\n", _perf_g_total_samples); +} + +// a simple test set used to check if test is working correctly +void latency_test_example() +{ + _perf_calibrate(); + printf("latency test example:\n"); + test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0); + test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_same_address_load_latency(1024, 0); + printf("total samples: %ld\n", _perf_g_total_samples); +} + +void legacy_latency_throughput_test() { _perf_calibrate(); printf("Memory throughput:\n"); - test_mem_throughput(512); + legacy_test_mem_throughput(1024); printf("L1 latency:\n"); - test_latency(4 * KB, 5); - test_latency(_PERF_L1_NOALIAS_SIZE_BYTE, 2); - test_latency(_PERF_L1_SIZE_BYTE/2, 2); - test_latency(_PERF_L1_SIZE_BYTE, 2); + test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0); + test_pointer_tracing_latency(_PERF_L1_NOALIAS_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0); printf("L2 latency:\n"); - test_latency(_PERF_L2_SIZE_BYTE/2, 2); - // test_latency(_PERF_L2_SIZE_BYTE, 2); + test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + // test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0); printf("L3 latency:\n"); - test_latency(_PERF_L3_SIZE_BYTE/2, 2); - // test_latency(_PERF_L3_SIZE_BYTE,2); + test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + // test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE,2, 0); // printf("MEM:\n"); - // test_latency(_PERF_L3_SIZE_BYTE*2,2); + // test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE*2, _PERF_CACHELINE_SIZE_BYTE,2, 0); + printf("total samples: %ld\n", _perf_g_total_samples); +} +int main() +{ + latency_test_example(); + typical_latency_test(); + // pointer_tracing_graph(); + // latency_test(); + // legacy_latency_throughput_test(); return 0; } \ No newline at end of file -- GitLab