diff --git a/apps/maprobe/Makefile b/apps/maprobe/Makefile
index c36a3029bc23756a1bee6417d9c3083c8b3fe60b..2bcc359067f8209ebe02181e70d96f3442148a12 100644
--- a/apps/maprobe/Makefile
+++ b/apps/maprobe/Makefile
@@ -1,3 +1,3 @@
 NAME = maprobe
-SRCS = maprobe.c
+SRCS = common.c bitutils.c latency-test.c main.c
 include $(AM_HOME)/Makefile.app
diff --git a/apps/maprobe/README.md b/apps/maprobe/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e6f712220831019e2bb7a24bafd69022642c266b
--- /dev/null
+++ b/apps/maprobe/README.md
@@ -0,0 +1,3 @@
+# Micro Architecture Probe (MAProbe)
+
+* Memory access latency test
\ No newline at end of file
diff --git a/apps/maprobe/bitutils.c b/apps/maprobe/bitutils.c
new file mode 100644
index 0000000000000000000000000000000000000000..cfad50f8ce951e25f393a6dd8befeaca3d189614
--- /dev/null
+++ b/apps/maprobe/bitutils.c
@@ -0,0 +1,47 @@
+#include "bitutils.h"
+
+// Bit-field helpers for 64-bit raw values. Ranges are half-open:
+// [low, high) covers bits low .. high-1.
+
+// Returns a mask with bits [low, high) set.
+uint64_t _perf_get_bit_mask(int low, int high) {
+    assert(low < high);
+    assert(low >= 0);
+    assert(high < 63);
+    // 1ULL keeps the shift in 64 bits; a plain int 1 overflows once high >= 31
+    return ((1ULL << high) - 1) >> low << low;
+}
+
+// Returns bits [low, high) of raw_data, shifted down to bit 0.
+uint64_t _perf_get_bits(uint64_t raw_data, int low, int high) {
+    assert(low < high);
+    assert(low >= 0);
+    assert(high < 63);
+    uint64_t mask = (1ULL << high) - 1;
+    return (raw_data & mask) >> low;
+}
+
+// Returns the bit of raw_data at position (0 or 1).
+uint64_t _perf_get_bit(uint64_t raw_data, int position) {
+    assert(position >= 0);
+    assert(position <= 63);
+    return (raw_data >> position) & 1;
+}
+
+// Returns raw_data with bits [low, high) replaced by the low bits of new_value.
+uint64_t _perf_set_bits(uint64_t raw_data, int low, int high, uint64_t new_value) {
+    assert(low < high);
+    assert(low >= 0);
+    assert(high < 63);
+    uint64_t mask = _perf_get_bit_mask(low, high);
+    // clear the field first, then merge in the new value
+    return (raw_data & ~mask) | ((new_value << low) & mask);
+}
+
+// Returns raw_data with the bit at position replaced by the lowest bit of new_value.
+uint64_t _perf_set_bit(uint64_t raw_data, int position, int new_value) {
+    assert(position >= 0);
+    assert(position <= 63);
+    uint64_t bit = 1ULL << position;
+    return (raw_data & ~bit) | ((uint64_t)(new_value & 1) << position);
+}
diff --git a/apps/maprobe/common.c b/apps/maprobe/common.c
new file mode 100644
index 0000000000000000000000000000000000000000..9cd55a622ce606e24b641858f26c4e61c0eac56a
--- /dev/null
+++ b/apps/maprobe/common.c
@@ -0,0 +1,59 @@
+#include "maprobe.h"
+
+// the single perf state object declared extern in maprobe.h
+struct perf perf;
+
+// Record the current cycle / retired-instruction counters as the start point.
+// Does nothing when PERF_SIM is defined.
+void _perf_start_timer(void)
+{
+#ifndef PERF_SIM
+    perf.cycle = csr_read(CSR_MCYCLE);
+    perf.instrcnt = csr_read(CSR_MINSTRET);
+#endif
+}
+
+// Replace the recorded start values with the deltas since _perf_start_timer().
+// Does nothing when PERF_SIM is defined.
+void _perf_end_timer(void)
+{
+#ifndef PERF_SIM
+    perf.cycle = csr_read(CSR_MCYCLE) - perf.cycle;
+    perf.instrcnt = csr_read(CSR_MINSTRET) - perf.instrcnt;
+#endif
+}
+
+// Print the last measured cycle count, instruction count and their ratio.
+void _perf_print_timer(void)
+{
+    // the counters are 64 bit: print them with %lu rather than %d
+    printf("cycle %lu inst %lu ipc %lf\n", (unsigned long)perf.cycle, (unsigned long)perf.instrcnt, (float)perf.instrcnt/perf.cycle);
+}
+
+// Measure and print the cycle / instruction delta between two back-to-back counter reads.
+void _perf_calibrate(void)
+{
+#ifndef PERF_SIM
+    // csr read delay
+    uint64_t cycle_1 = csr_read(CSR_MCYCLE);
+    uint64_t cycle_2 = csr_read(CSR_MCYCLE);
+    perf.csr_read_cycle = cycle_2-cycle_1;
+    printf("perf_calibrate: csr_read_cycle %lu\n", (unsigned long)perf.csr_read_cycle);
+
+    // csr read inst cost
+    uint64_t inst_1 = csr_read(CSR_MINSTRET);
+    uint64_t inst_2 = csr_read(CSR_MINSTRET);
+    perf.csr_read_ninst = inst_2-inst_1;
+    printf("perf_calibrate: csr_read_ninst %lu\n", (unsigned long)perf.csr_read_ninst);
+#else
+    printf("running in simulation environment, hpm read disabled\n");
+#endif
+}
+
+// Store value to the _PERF_BLACKHOLE address, giving a computed result
+// an observable use.
+void _perf_blackhole(uint64_t value)
+{
+    // volatile: the store itself must not be dropped or merged
+    *(volatile uint64_t *)_PERF_BLACKHOLE = value;
+}
diff --git a/apps/maprobe/include/bitutils.h b/apps/maprobe/include/bitutils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3b7f54be8711d5558f8905af664d85c3012972f
--- /dev/null
+++ b/apps/maprobe/include/bitutils.h
@@ -0,0 +1,16 @@
+// bit op utils for perf
+
+#ifndef PROBE_BITUTILS_H
+#define PROBE_BITUTILS_H
+
+// NOTE(review): the header name was missing here; <klib.h> is assumed to supply uint64_t and assert() -- confirm
+#include <klib.h>
+
+// bit ranges are half-open: [low, high) covers bits low .. high-1
+extern uint64_t _perf_get_bit_mask(int low, int high);
+extern uint64_t _perf_get_bits(uint64_t raw_data, int low, int high);
+extern uint64_t _perf_get_bit(uint64_t raw_data, int position);
+extern uint64_t _perf_set_bits(uint64_t raw_data, int low, int high, uint64_t new_value);
+extern uint64_t _perf_set_bit(uint64_t raw_data, int position, int new_value);
+
+#endif
\ No newline at end of file
diff --git a/apps/maprobe/include/maprobe.h b/apps/maprobe/include/maprobe.h
index ace1682c2e3f023e9c2dd7cef841e3ed2b43064b..27a170dae43cb68f3c6c15944e4f0bf06b4f0015 100644
--- a/apps/maprobe/include/maprobe.h
+++ b/apps/maprobe/include/maprobe.h
@@ -5,6 +5,10 @@
 
 #include <klib.h>
 #include <csr.h>
+#include "bitutils.h"
+
+// config
+// #define PERF_SIM // probe runs in simulator, disable perf counters
 
 // perf const
 #define BYTE (1)
@@ -13,14 +17,16 @@
 #define GB (1024*MB)
 
 // platform dependent const
-// #define _PERF_TEST_ADDR_BASE 0x80400000
-#define _PERF_TEST_ADDR_BASE 0x2000400000
+#ifndef _PERF_TEST_ADDR_BASE
+#define _PERF_TEST_ADDR_BASE 0x80400000
+// #define _PERF_TEST_ADDR_BASE 0x2000400000
+#endif
 #define _PERF_CACHELINE_SIZE_BYTE (64 * BYTE)
 #define _PERF_L1_NOALIAS_SIZE_BYTE (32 * KB)
-#define _PERF_L1_SIZE_BYTE (128 * KB)
+#define _PERF_L1_SIZE_BYTE (64 * KB)
 #define _PERF_L2_SIZE_BYTE (512 * KB)
 #define _PERF_L3_SIZE_BYTE (2 * MB)
-#define _PERF_L1_NUM_WAYS 8
+#define _PERF_L1_NUM_WAYS 4
 #define _PERF_SET_SIZE_BYTE (_PERF_L1_SIZE_BYTE / _PERF_L1_NUM_WAYS)
 
 // probe const
@@ -29,128 +35,29 @@
 struct perf
 {
     // const to be calibrated at run time
-    uint64_t csr_read_cycle; //# of cycles to read mcycle
+    uint64_t csr_read_cycle; // # of cycles to read mcycle
     uint64_t csr_read_ninst; // # of inst needed to read minstret
 
     // timer
     uint64_t cycle;
     uint64_t instrcnt;
-} perf;
-
-void _perf_start_timer()
-{
-    perf.cycle = csr_read(CSR_MCYCLE);
-    perf.instrcnt = csr_read(CSR_MINSTRET);
-}
-
-void _perf_end_timer()
-{
-    perf.cycle = csr_read(CSR_MCYCLE) - perf.cycle;
-    perf.instrcnt = csr_read(CSR_MINSTRET) - perf.instrcnt;
-}
-
-void _perf_print_timer()
-{
-    printf("cycle %d inst %d ipc %lf\n", perf.cycle, perf.instrcnt, (float)perf.instrcnt/perf.cycle);
-}
-
-void _perf_calibrate()
-{
-    // csr read delay
-    uint64_t cycle_1 = csr_read(CSR_MCYCLE);
-    uint64_t cycle_2 = csr_read(CSR_MCYCLE);
-    perf.csr_read_cycle = cycle_2-cycle_1;
-    printf("perf_calibrate: csr_read_cycle %d\n", perf.csr_read_cycle);
-
-    // csr read inst cost
-    uint64_t inst_1 = csr_read(CSR_MINSTRET);
-    uint64_t inst_2 = csr_read(CSR_MINSTRET);
-    perf.csr_read_ninst = inst_2-inst_1;
-    printf("perf_calibrate: csr_read_ninst %d\n", perf.csr_read_ninst);
-}
-
-void _perf_blackhole(uint64_t value)
-{
-    *(uint64_t*) _PERF_BLACKHOLE = value;
-}
-
-uint64_t setup_latency_test_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step)
-{
-    uint64_t num_valid_node = 0;
-    assert(step % 8 == 0);
-    assert(step >= 8);
-    for (uint64_t cur_addr = base_addr; cur_addr < end_addr;) {
-        uint64_t next_addr = cur_addr + step;
-        *((uint64_t*)cur_addr) = next_addr;
-        cur_addr = next_addr;
-        num_valid_node++;
-    }
-    return num_valid_node;
-}
-
-uint64_t read_latency_test_linklist(uint64_t base_addr, uint64_t num_valid_node)
-{
-    uint64_t cur_addr = base_addr;
-    for (int i = 0; i < num_valid_node; i++) {
-        cur_addr = (*(uint64_t*)cur_addr);
-    }
-    return cur_addr;
-}
-
-void warmup(uint64_t base_addr, uint64_t end_addr)
-{
-    setup_latency_test_linklist(base_addr, end_addr, _PERF_CACHELINE_SIZE_BYTE);
-}
-
-void test_latency(uint64_t size, int iter)
-{
-    volatile uint64_t result = 0; // make sure compiler will not opt read_latency_test_linklist
-    printf("range 0x%xB (%d iters) latency test\n", size, iter);
-    _perf_start_timer();
-    uint64_t nnode = setup_latency_test_linklist(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size, _PERF_CACHELINE_SIZE_BYTE);
-    _perf_end_timer();
-    uint64_t total_node = nnode * iter;
-    // _perf_print_timer();
-
-    _perf_start_timer();
-    for (int i = 0; i < iter; i++) {
-        result += read_latency_test_linklist(_PERF_TEST_ADDR_BASE, nnode);
-    }
-    _perf_end_timer();
-    // _perf_print_timer();
-    printf("range 0x%xB (%d intrs) read latency %f (%d samples)\n", size, iter, (float)perf.cycle / total_node, total_node);
-
-    _perf_blackhole(result);
-}
-
-void test_mem_throughput(uint64_t iter)
-{
-    uint64_t remain = iter;
-    uint64_t result = 0;
-    uint64_t access_addr = _PERF_TEST_ADDR_BASE;
-    _perf_start_timer();
-    while (remain--) {
-        result += *(uint64_t*) access_addr;
-        access_addr += _PERF_CACHELINE_SIZE_BYTE;
-    }
-    _perf_end_timer();
-    *(uint64_t*) _PERF_BLACKHOLE = result;
-    printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter);
-}
-
-void test_mem_throughput_same_set(uint64_t iter)
-{
-    uint64_t remain = iter;
-    uint64_t result = 0;
-    uint64_t access_addr = _PERF_TEST_ADDR_BASE;
-    _perf_start_timer();
-    while (remain--) {
-        result += *(uint64_t*) access_addr;
-        access_addr += _PERF_SET_SIZE_BYTE;
-    }
-    _perf_end_timer();
-    *(uint64_t*) _PERF_BLACKHOLE = result;
-    printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter);
-}
+};
+
+extern struct perf perf;
+
+// common perf tools
+extern void _perf_start_timer(void);
+extern void _perf_end_timer(void);
+extern void _perf_print_timer(void);
+extern void _perf_calibrate(void);
+extern void _perf_blackhole(uint64_t value);
+
+// latency test
+extern uint64_t setup_latency_test_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step);
+extern uint64_t read_latency_test_linklist(uint64_t base_addr, uint64_t num_valid_node);
+extern void latency_test_warmup(uint64_t base_addr, uint64_t end_addr);
+extern void test_latency(uint64_t size, int iter);
+extern void test_mem_throughput(uint64_t iter);
+extern void test_mem_throughput_same_set(uint64_t iter);
 
 #endif
\ No newline at end of file
diff --git a/apps/maprobe/latency-test.c b/apps/maprobe/latency-test.c
new file mode 100644
index 0000000000000000000000000000000000000000..570f11e879ebf280fd5ea7470b16e344c071df70
--- /dev/null
+++ b/apps/maprobe/latency-test.c
@@ -0,0 +1,93 @@
+#include "maprobe.h"
+
+// Build a pointer-chasing list in [base_addr, end_addr): each node is a
+// uint64_t that stores the address of the node `step` bytes after it.
+// step must be a multiple of 8 and at least 8 (asserted).
+// Returns the number of nodes written.
+uint64_t setup_latency_test_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step)
+{
+    uint64_t num_valid_node = 0;
+    assert(step % 8 == 0);
+    assert(step >= 8);
+    for (uint64_t cur_addr = base_addr; cur_addr < end_addr;) {
+        uint64_t next_addr = cur_addr + step;
+        *((uint64_t*)cur_addr) = next_addr;
+        cur_addr = next_addr;
+        num_valid_node++;
+    }
+    return num_valid_node;
+}
+
+// Follow the list from base_addr for num_valid_node hops.
+// Returns the address held by the last node visited.
+uint64_t read_latency_test_linklist(uint64_t base_addr, uint64_t num_valid_node)
+{
+    uint64_t cur_addr = base_addr;
+    // 64-bit counter: an int index would overflow on very long lists
+    for (uint64_t i = 0; i < num_valid_node; i++) {
+        // volatile load: each hop must really be issued, so the timed
+        // chain cannot be cached or hoisted by the compiler
+        cur_addr = *(volatile uint64_t*)cur_addr;
+    }
+    return cur_addr;
+}
+
+// Write a cacheline-stride list over [base_addr, end_addr).
+void latency_test_warmup(uint64_t base_addr, uint64_t end_addr)
+{
+    setup_latency_test_linklist(base_addr, end_addr, _PERF_CACHELINE_SIZE_BYTE);
+}
+
+// Time `iter` passes over a cacheline-stride list covering `size` bytes from
+// _PERF_TEST_ADDR_BASE and print the average cycles per hop.
+void test_latency(uint64_t size, int iter)
+{
+    volatile uint64_t result = 0; // make sure compiler will not opt read_latency_test_linklist
+    // size and total_node are 64 bit: print via unsigned long, not %x / %d
+    printf("range 0x%lxB (%d iters) latency test\n", (unsigned long)size, iter);
+    _perf_start_timer();
+    uint64_t nnode = setup_latency_test_linklist(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size, _PERF_CACHELINE_SIZE_BYTE);
+    _perf_end_timer();
+    uint64_t total_node = nnode * iter;
+    // _perf_print_timer();
+
+    _perf_start_timer();
+    for (int i = 0; i < iter; i++) {
+        result += read_latency_test_linklist(_PERF_TEST_ADDR_BASE, nnode);
+    }
+    _perf_end_timer();
+    // _perf_print_timer();
+    printf("range 0x%lxB (%d iters) read latency %f (%lu samples)\n", (unsigned long)size, iter, (float)perf.cycle / total_node, (unsigned long)total_node);
+
+    _perf_blackhole(result);
+}
+
+// Shared body of the throughput tests: perform `iter` 8-byte loads starting at
+// _PERF_TEST_ADDR_BASE, `stride` bytes apart, then print bytes per cycle
+// (each load is counted as one cacheline).
+static void run_mem_throughput(uint64_t iter, uint64_t stride)
+{
+    uint64_t remain = iter;
+    uint64_t result = 0;
+    uint64_t access_addr = _PERF_TEST_ADDR_BASE;
+    _perf_start_timer();
+    while (remain--) {
+        result += *(uint64_t*) access_addr;
+        access_addr += stride;
+    }
+    _perf_end_timer();
+    _perf_blackhole(result);
+    printf("mem band width %f B/cycle (%lu samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, (unsigned long)iter);
+}
+
+// Throughput with one load per consecutive cacheline.
+void test_mem_throughput(uint64_t iter)
+{
+    run_mem_throughput(iter, _PERF_CACHELINE_SIZE_BYTE);
+}
+
+// Throughput with loads _PERF_SET_SIZE_BYTE (L1 size / ways) apart.
+void test_mem_throughput_same_set(uint64_t iter)
+{
+    run_mem_throughput(iter, _PERF_SET_SIZE_BYTE);
+}
diff --git a/apps/maprobe/maprobe.c b/apps/maprobe/main.c
similarity index 100%
rename from apps/maprobe/maprobe.c
rename to apps/maprobe/main.c