Review notes (text ahead of the first "diff --git" header is normally skipped by git apply / patch):
- Line breaks and indentation were reconstructed from a whitespace-collapsed copy of this patch;
  NOTE(review): confirm the context lines against the tree before applying.
- test_read_after_write_latency: the CSV printf now passes 0L for its %ld conversion, a header comment
  was added (hunk counts adjusted accordingly), and the main.c label now reads "store then load".
- The index hashes are carried over from the original patch and no longer describe the edited post-image.

diff --git a/apps/maprobe/include/maprobe.h b/apps/maprobe/include/maprobe.h
index b17ea0ba7e3b1f879cdd1543ec328a4731f4ba3f..0f8ba26f9b8d508796d7d045f6f6ff738dbce0d2 100644
--- a/apps/maprobe/include/maprobe.h
+++ b/apps/maprobe/include/maprobe.h
@@ -71,6 +71,7 @@ extern void test_pointer_tracing_latency(uint64_t size, int step, int iter, int
 extern void test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv);
 extern void test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv);
 extern void test_same_address_load_latency(int iter, int to_csv);
+extern void test_read_after_write_latency(int iter, int to_csv);
 
 extern void legacy_test_mem_throughput(uint64_t iter);
 extern void legacy_test_mem_throughput_same_set(uint64_t iter);
diff --git a/apps/maprobe/latency-test.c b/apps/maprobe/latency-test.c
index ad152388f29f540e929972efe70e77f7974eb290..04f0e1a96678014baf24dee756a2ed4a6cca642b 100644
--- a/apps/maprobe/latency-test.c
+++ b/apps/maprobe/latency-test.c
@@ -50,7 +50,7 @@ void test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv)
 {
     // printf("pointer tracing latency test\n");
     // printf("range (B), read latency, iters, samples, cycles\n");
-    volatile uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist
+    register uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist
     _perf_start_timer();
     uint64_t nnode = setup_pointer_tracing_linklist(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size, step);
     _perf_end_timer();
@@ -79,13 +79,13 @@ void test_same_address_load_latency(int iter, int to_csv)
 {
     // printf("same address load latency test\n", step);
     // printf("range (B), read latency, iters, samples, cycles\n");
-    volatile uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist
+    register uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist
     // _perf_print_timer();
 
     _perf_start_timer();
     uint64_t address = _PERF_TEST_ADDR_BASE;
     for (int i = 0; i < iter; i++) {
-        result += *((uint64_t*) (address));
+        result += *((volatile uint64_t*) (address));
     }
     _perf_end_timer();
     // _perf_print_timer();
@@ -102,11 +102,50 @@ void test_same_address_load_latency(int iter, int to_csv)
     _perf_g_total_samples += total_access;
 }
 
+// Read-after-write latency test.
+//
+// `result` is volatile, so every `result += ...` in the timed loop performs
+// an actual read of `result` followed by an actual write to it; the read in
+// the next iteration therefore follows a write to the same object. The
+// value added is a 64-bit word loaded from an address that starts at
+// _PERF_TEST_ADDR_BASE and advances by sizeof(uint64_t) per iteration.
+//
+// iter:   number of timed loop iterations, also reported as the sample count.
+// to_csv: non-zero prints a CSV row (first field is a constant 0), zero
+//         prints a human-readable summary line.
+void test_read_after_write_latency(int iter, int to_csv)
+{
+    // printf("same address store-load latency test\n");
+    // printf("range (B), read latency, iters, samples, cycles\n");
+    volatile uint64_t result = 0; // make sure compiler will store data to memory
+    // _perf_print_timer();
+
+    _perf_start_timer();
+    uint64_t address = _PERF_TEST_ADDR_BASE;
+    for (int i = 0; i < iter; i++) {
+        result += *((uint64_t*) (address));
+        address += sizeof(uint64_t);
+    }
+    _perf_end_timer();
+    // _perf_print_timer();
+    uint64_t total_access = iter;
+    if (to_csv) {
+        printf("%ld, %f, %d, %ld, %ld\n", 0L, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
+    } else {
+        printf("read after write latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n",
+            (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle
+        );
+    }
+
+    _perf_blackhole(result);
+    _perf_g_total_samples += total_access;
+}
+
 void test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv)
 {
     // printf("stride %d linear access latency test\n", step);
     // printf("range (B), read latency, iters, samples, cycles\n");
-    volatile uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist
+    register uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist
     uint64_t num_access = size / step;
     // _perf_print_timer();
 
@@ -139,7 +178,7 @@ void test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64
     // test_align, pregen_addr ? "use pregen addr array" : "gen rand addr at run time"
     // );
     // printf("range (B), read latency, iters, samples, cycles\n");
-    volatile uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist
+    register uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist
     // _perf_print_timer();
 
     // alloc memory for random access addr array and data
diff --git a/apps/maprobe/main.c b/apps/maprobe/main.c
index 9b64ce9afdfdc10ad3db33f7eb1b3b66ed74bf32..7d6ee3049ddc45ddcdbc50e7b87cd4f3816dfc9c 100644
--- a/apps/maprobe/main.c
+++ b/apps/maprobe/main.c
@@ -27,11 +27,11 @@ void typical_linear_load_test_set()
     test_linear_access_latency(_PERF_L2_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
     test_linear_access_latency(_PERF_L2_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
     printf("L1 (L1 same set) linear cache line load:\n");
-    test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 1, 0);
     test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 10, 0);
+    test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 100, 0);
     printf("L2 (L1 same set) linear cache line load:\n");
-    test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 1, 0);
     test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 2, 0);
+    test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 4, 0);
     printf("L1 (L2 same slice) linear cache line load:\n");
     test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 1, 0);
     test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 2, 0);
@@ -39,11 +39,11 @@ void typical_linear_load_test_set()
     test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 1, 0);
     test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 2, 0);
     printf("L1 (page traverse) linear cache line load:\n");
-    test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 1, 0);
     test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 10, 0);
+    test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 100, 0);
     printf("L2 (page traverse) linear cache line load:\n");
-    test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 1, 0);
     test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 2, 0);
+    test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 4, 0);
     printf("total samples: %ld\n", _perf_g_total_samples);
 }
 
@@ -103,6 +103,10 @@ void typical_memory_disambiuation_test_set()
     test_same_address_load_latency(1024, 0);
     test_same_address_load_latency(1024, 0);
     test_same_address_load_latency(1024, 0);
+    printf("store then load to the same address:\n");
+    test_read_after_write_latency(1024, 0);
+    test_read_after_write_latency(1024, 0);
+    test_read_after_write_latency(1024, 0);
     // more to be added
 }
 
@@ -141,10 +145,12 @@ void latency_test_example()
     _perf_calibrate();
     printf("latency test example:\n");
     test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0);
+    test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0);
     test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0);
     test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0);
     test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0);
     test_same_address_load_latency(1024, 0);
+    test_read_after_write_latency(1024, 0);
     printf("total samples: %ld\n", _perf_g_total_samples);
 }
 