diff --git a/apps/maprobe/Makefile b/apps/maprobe/Makefile index c36a3029bc23756a1bee6417d9c3083c8b3fe60b..663c48ebef40f185ec9e528b55d95c76c9039367 100644 --- a/apps/maprobe/Makefile +++ b/apps/maprobe/Makefile @@ -1,3 +1,3 @@ NAME = maprobe -SRCS = maprobe.c +SRCS = common.c bitutils.c resultmat.c latency-test.c bandwidth-test.c replacement-test.c main.c include $(AM_HOME)/Makefile.app diff --git a/apps/maprobe/README.md b/apps/maprobe/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e6f712220831019e2bb7a24bafd69022642c266b --- /dev/null +++ b/apps/maprobe/README.md @@ -0,0 +1,3 @@ +# Micro Architecture Probe (MAProbe) + +* Memory access latency test \ No newline at end of file diff --git a/apps/maprobe/bandwidth-test.c b/apps/maprobe/bandwidth-test.c new file mode 100644 index 0000000000000000000000000000000000000000..549956d3c1027a95666301954ccf989a129b46d1 --- /dev/null +++ b/apps/maprobe/bandwidth-test.c @@ -0,0 +1,99 @@ +#include "maprobe.h" + +float test_l1_load_bandwidth(uint64_t size, int iter, int to_csv) +{ + // printf("stride %d linear access latency test\n", step); + // printf("range (B), read latency, iters, samples, cycles\n"); + assert(size >= _PERF_CACHELINE_SIZE_BYTE); + + // _perf_print_timer(); + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) { + __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("ld a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("ld a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("ld a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("ld a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("ld a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("ld a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("ld a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0"); + } + } + _perf_end_timer(); + // _perf_print_timer(); + uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter; + float bandwidth = total_access * 8 * BYTE / (float)perf.cycle; + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle); + } else { + printf("range %ldKB (%d iters) dcache linear (8Byte) read, latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", + size/KB, iter, (float)perf.cycle / total_access, bandwidth, total_access, perf.cycle, 8 + ); + } + _perf_g_total_samples += total_access; + return bandwidth; +} + +float test_l1_store_bandwidth(uint64_t size, int iter, int to_csv) +{ + // printf("stride %d linear access latency test\n", step); + // printf("range (B), read latency, iters, samples, cycles\n"); + assert(size >= _PERF_CACHELINE_SIZE_BYTE); + + // _perf_print_timer(); + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) { + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("sd a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("sd a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("sd a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("sd a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("sd a0, 40(%[addr])\n" :: 
[addr] "r"(address) : "a0"); + __asm__ volatile ("sd a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0"); + __asm__ volatile ("sd a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0"); + } + } + _perf_end_timer(); + // _perf_print_timer(); + uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter; + float bandwidth = total_access * 8 * BYTE / (float)perf.cycle; + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle); + } else { + printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", + size/KB, iter, (float)perf.cycle / total_access, bandwidth, total_access, perf.cycle, 8 + ); + } + _perf_g_total_samples += total_access; + return bandwidth; +} + +float test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv) +{ + // printf("stride %d linear access latency test\n", step); + // printf("range (B), read latency, iters, samples, cycles\n"); + assert(size >= _PERF_CACHELINE_SIZE_BYTE); + + // _perf_print_timer(); + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) { + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0"); + } + } + _perf_end_timer(); + // _perf_print_timer(); + uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * iter; + float bandwidth = total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle; + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle); + } else { + printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (L1-L2 %f B/cycle) (%ld samples, %ld cycles), stride %dB\n", + size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle, total_access, perf.cycle, _PERF_CACHELINE_SIZE_BYTE + ); + } + _perf_g_total_samples += total_access; + return bandwidth; +} \ No newline at end of file diff --git a/apps/maprobe/bitutils.c b/apps/maprobe/bitutils.c new file mode 100644 index 0000000000000000000000000000000000000000..cfad50f8ce951e25f393a6dd8befeaca3d189614 --- /dev/null +++ b/apps/maprobe/bitutils.c @@ -0,0 +1,36 @@ +#include "bitutils.h" + +inline uint64_t _perf_get_bit_mask(int low, int high) { + assert(low < high); + assert(low >= 0); + assert(high < 63); + return ((1 << high) - 1) >> low << low; +} + +inline uint64_t _perf_get_bits(uint64_t raw_data, int low, int high) { + assert(low < high); + assert(low >= 0); + assert(high < 63); + uint64_t mask = (1 << high) - 1; + return (raw_data & mask) >> low; +} + +inline uint64_t _perf_get_bit(uint64_t raw_data, int position) { + assert(position >= 0); + assert(position <= 63); + return (raw_data >> position) & 1; +} + +inline uint64_t _perf_set_bits(uint64_t raw_data, int low, int high, uint64_t new_value) { + assert(low < high); + assert(low >= 0); + assert(high < 63); + int mask = _perf_get_bit_mask(low, high); + return (raw_data & mask) | ((new_value << low) & mask); +} + +inline uint64_t _perf_set_bit(uint64_t raw_data, int position, int new_value) { + assert(position >= 0); + assert(position <= 63); + return raw_data & ((new_value & 1) << position); +} diff --git a/apps/maprobe/common.c b/apps/maprobe/common.c new file mode 100644 index 
diff --git a/apps/maprobe/common.c b/apps/maprobe/common.c
new file mode 100644
index 0000000000000000000000000000000000000000..2b5e7a961aa9d0e91ebfef3688c1c563a6544bf1
--- /dev/null
+++ b/apps/maprobe/common.c
@@ -0,0 +1,49 @@
+#include "maprobe.h"
+
+struct perf perf;
+uint64_t _perf_g_total_samples = 0;
+
+void _perf_start_timer()
+{
+#ifndef PERF_SIM
+    perf.instrcnt = csr_read(CSR_MINSTRET);
+    perf.cycle = csr_read(CSR_MCYCLE);
+#endif
+}
+
+void _perf_end_timer()
+{
+#ifndef PERF_SIM
+    perf.cycle = csr_read(CSR_MCYCLE) - perf.cycle;
+    perf.instrcnt = csr_read(CSR_MINSTRET) - perf.instrcnt;
+#endif
+}
+
+void _perf_print_timer()
+{
+    printf("cycle %ld inst %ld ipc %f\n", perf.cycle, perf.instrcnt, (float)perf.instrcnt/perf.cycle);
+}
+
+void _perf_calibrate()
+{
+#ifndef PERF_SIM
+    // csr read delay
+    uint64_t cycle_1 = csr_read(CSR_MCYCLE);
+    uint64_t cycle_2 = csr_read(CSR_MCYCLE);
+    perf.csr_read_cycle = cycle_2-cycle_1;
+    printf("perf_calibrate: csr_read_cycle %ld\n", perf.csr_read_cycle);
+
+    // csr read inst cost
+    uint64_t inst_1 = csr_read(CSR_MINSTRET);
+    uint64_t inst_2 = csr_read(CSR_MINSTRET);
+    perf.csr_read_ninst = inst_2-inst_1;
+    printf("perf_calibrate: csr_read_ninst %ld\n", perf.csr_read_ninst);
+#else
+    printf("running in simulation environment, hpm read disabled\n");
+#endif
+}
+
+void _perf_blackhole(uint64_t value)
+{
+    *(uint64_t*) _PERF_BLACKHOLE = value;
+}
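The timer pair above is the measurement primitive every test in this patch builds on: `_perf_start_timer`/`_perf_end_timer` snapshot and difference the `mcycle`/`minstret` CSRs, leaving the deltas in the global `perf` struct. A minimal usage sketch (the function name and loop body are illustrative):

```c
#include "maprobe.h"

// Hypothetical micro-test skeleton: wrap the measured region with the
// timer pair, then derive cycles-per-access from perf.cycle.
float measure_region(int iter) {
    _perf_start_timer();
    for (int i = 0; i < iter; i++) {
        __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(_PERF_TEST_ADDR_BASE) : "a0");
    }
    _perf_end_timer();
    return (float)perf.cycle / iter;  // average cycles per access
}
```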
diff --git a/apps/maprobe/include/bitutils.h b/apps/maprobe/include/bitutils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3b7f54be8711d5558f8905af664d85c3012972f
--- /dev/null
+++ b/apps/maprobe/include/bitutils.h
@@ -0,0 +1,14 @@
+// bit op utils for perf
+
+#ifndef PROBE_BITUTILS_H
+#define PROBE_BITUTILS_H
+
+#include <klib.h>
+
+extern uint64_t _perf_get_bit_mask(int low, int high);
+extern uint64_t _perf_get_bits(uint64_t raw_data, int low, int high);
+extern uint64_t _perf_get_bit(uint64_t raw_data, int position);
+extern uint64_t _perf_set_bits(uint64_t raw_data, int low, int high, uint64_t new_value);
+extern uint64_t _perf_set_bit(uint64_t raw_data, int position, int new_value);
+
+#endif
\ No newline at end of file
diff --git a/apps/maprobe/include/maprobe.h b/apps/maprobe/include/maprobe.h
index ace1682c2e3f023e9c2dd7cef841e3ed2b43064b..85695a8bb515f7b0a03ea1a7ea760a5ee59f9f2f 100644
--- a/apps/maprobe/include/maprobe.h
+++ b/apps/maprobe/include/maprobe.h
@@ -5,6 +5,11 @@
 #include <am.h>
 #include <klib.h>
+#include "bitutils.h"
+#include "resultmat.h"
+
+// config
+// #define PERF_SIM // probe runs in a simulator, disable perf counters
 
 // perf const
 #define BYTE (1)
@@ -13,15 +18,28 @@
 #define GB (1024*MB)
 
 // platform dependent const
-// #define _PERF_TEST_ADDR_BASE 0x80400000
-#define _PERF_TEST_ADDR_BASE 0x2000400000
+#ifndef _PERF_TEST_ADDR_BASE
+#define _PERF_TEST_ADDR_BASE 0x80400000
+// #define _PERF_TEST_ADDR_BASE 0x2000400000
+#endif
 #define _PERF_CACHELINE_SIZE_BYTE (64 * BYTE)
-#define _PERF_L1_NOALIAS_SIZE_BYTE (32 * KB)
-#define _PERF_L1_SIZE_BYTE (128 * KB)
-#define _PERF_L2_SIZE_BYTE (512 * KB)
-#define _PERF_L3_SIZE_BYTE (2 * MB)
-#define _PERF_L1_NUM_WAYS 8
-#define _PERF_SET_SIZE_BYTE (_PERF_L1_SIZE_BYTE / _PERF_L1_NUM_WAYS)
+#define _PERF_PAGE_SIZE_BYTE (4 * KB)
+#define _PERF_L1_NOALIAS_SIZE_BYTE (16 * KB)
+#define _PERF_L1_SIZE_BYTE (64 * KB)
+#define _PERF_L2_SIZE_BYTE (1 * MB)
+#define _PERF_L3_SIZE_BYTE (6 * MB)
+#define _PERF_MEM_SIZE_BYTE (1024 * MB)
+#define _PERF_L1_NUM_WAYS 4
+#define _PERF_L1_NUM_SETS 256
+#define _PERF_L2_NUM_WAYS 8
+#define _PERF_L2_NUM_SLICES 4
+#define _PERF_L2_NUM_SETS 512
+
+#define _PERF_ADDR_STRIDE_L1_SAME_BANK _PERF_CACHELINE_SIZE_BYTE
+#define _PERF_ADDR_STRIDE_L1_SAME_SET (_PERF_L1_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
+#define _PERF_ADDR_STRIDE_L2_SAME_SLICE (_PERF_L2_NUM_SLICES * _PERF_CACHELINE_SIZE_BYTE)
+#define _PERF_ADDR_STRIDE_L2_SAME_SET (_PERF_L2_NUM_SLICES * _PERF_L2_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
+#define _PERF_ADDR_STRIDE_NEXT_PAGE (_PERF_PAGE_SIZE_BYTE)
 
 // probe const
 #define _PERF_BLACKHOLE _PERF_TEST_ADDR_BASE
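These stride constants turn cache geometry into address deltas: two addresses `_PERF_ADDR_STRIDE_L1_SAME_SET` apart share the same L1 set index, so touching more than `_PERF_L1_NUM_WAYS` such lines must evict. A small sketch of how a same-set address walk follows from the macros (illustrative helpers, not part of the probe):

```c
#include "maprobe.h"

// The i-th address mapping to the same L1 set as base:
// base + i * (num_sets * cacheline) leaves the set index bits unchanged.
static inline uint64_t l1_same_set_addr(uint64_t base, int i) {
    return base + (uint64_t)i * _PERF_ADDR_STRIDE_L1_SAME_SET;
}

// With 4 ways, the 5th distinct line in this loop cannot fit in the set,
// so each full pass beyond n = 4 forces at least one replacement.
static void touch_same_set_lines(int n) {
    for (int i = 0; i < n; i++) {
        *(volatile uint64_t *)l1_same_set_addr(_PERF_TEST_ADDR_BASE, i);
    }
}
```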
@@ -29,128 +47,51 @@
 
 struct perf {
     // const to be calibrated at run time
-    uint64_t csr_read_cycle; //# of cycles to read mcycle
+    uint64_t csr_read_cycle; // # of cycles to read mcycle
     uint64_t csr_read_ninst; // # of inst needed to read minstret
     // timer
     uint64_t cycle;
     uint64_t instrcnt;
-} perf;
-
-void _perf_start_timer()
-{
-    perf.cycle = csr_read(CSR_MCYCLE);
-    perf.instrcnt = csr_read(CSR_MINSTRET);
-}
-
-void _perf_end_timer()
-{
-    perf.cycle = csr_read(CSR_MCYCLE) - perf.cycle;
-    perf.instrcnt = csr_read(CSR_MINSTRET) - perf.instrcnt;
-}
-
-void _perf_print_timer()
-{
-    printf("cycle %d inst %d ipc %lf\n", perf.cycle, perf.instrcnt, (float)perf.instrcnt/perf.cycle);
-}
-
-void _perf_calibrate()
-{
-    // csr read delay
-    uint64_t cycle_1 = csr_read(CSR_MCYCLE);
-    uint64_t cycle_2 = csr_read(CSR_MCYCLE);
-    perf.csr_read_cycle = cycle_2-cycle_1;
-    printf("perf_calibrate: csr_read_cycle %d\n", perf.csr_read_cycle);
-
-    // csr read inst cost
-    uint64_t inst_1 = csr_read(CSR_MINSTRET);
-    uint64_t inst_2 = csr_read(CSR_MINSTRET);
-    perf.csr_read_ninst = inst_2-inst_1;
-    printf("perf_calibrate: csr_read_ninst %d\n", perf.csr_read_ninst);
-}
-
-void _perf_blackhole(uint64_t value)
-{
-    *(uint64_t*) _PERF_BLACKHOLE = value;
-}
-
-uint64_t setup_latency_test_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step)
-{
-    uint64_t num_valid_node = 0;
-    assert(step % 8 == 0);
-    assert(step >= 8);
-    for (uint64_t cur_addr = base_addr; cur_addr < end_addr;) {
-        uint64_t next_addr = cur_addr + step;
-        *((uint64_t*)cur_addr) = next_addr;
-        cur_addr = next_addr;
-        num_valid_node++;
-    }
-    return num_valid_node;
-}
-
-uint64_t read_latency_test_linklist(uint64_t base_addr, uint64_t num_valid_node)
-{
-    uint64_t cur_addr = base_addr;
-    for (int i = 0; i < num_valid_node; i++) {
-        cur_addr = (*(uint64_t*)cur_addr);
-    }
-    return cur_addr;
-}
-
-void warmup(uint64_t base_addr, uint64_t end_addr)
-{
-    setup_latency_test_linklist(base_addr, end_addr, _PERF_CACHELINE_SIZE_BYTE);
-}
-
-void test_latency(uint64_t size, int iter)
-{
-    volatile uint64_t result = 0; // make sure compiler will not opt read_latency_test_linklist
-    printf("range 0x%xB (%d iters) latency test\n", size, iter);
-    _perf_start_timer();
-    uint64_t nnode = setup_latency_test_linklist(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size, _PERF_CACHELINE_SIZE_BYTE);
-    _perf_end_timer();
-    uint64_t total_node = nnode * iter;
-    // _perf_print_timer();
-
-    _perf_start_timer();
-    for (int i = 0; i < iter; i++) {
-        result += read_latency_test_linklist(_PERF_TEST_ADDR_BASE, nnode);
-    }
-    _perf_end_timer();
-    // _perf_print_timer();
-    printf("range 0x%xB (%d intrs) read latency %f (%d samples)\n", size, iter, (float)perf.cycle / total_node, total_node);
-
-    _perf_blackhole(result);
-}
-
-void test_mem_throughput(uint64_t iter)
-{
-    uint64_t remain = iter;
-    uint64_t result = 0;
-    uint64_t access_addr = _PERF_TEST_ADDR_BASE;
-    _perf_start_timer();
-    while (remain--) {
-        result += *(uint64_t*) access_addr;
-        access_addr += _PERF_CACHELINE_SIZE_BYTE;
-    }
-    _perf_end_timer();
-    *(uint64_t*) _PERF_BLACKHOLE = result;
-    printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter);
-}
-
-void test_mem_throughput_same_set(uint64_t iter)
-{
-    uint64_t remain = iter;
-    uint64_t result = 0;
-    uint64_t access_addr = _PERF_TEST_ADDR_BASE;
-    _perf_start_timer();
-    while (remain--) {
-        result += *(uint64_t*) access_addr;
-        access_addr += _PERF_SET_SIZE_BYTE;
-    }
-    _perf_end_timer();
-    *(uint64_t*) _PERF_BLACKHOLE = result;
-    printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter);
-}
+};
+extern struct perf perf;
+
+extern uint64_t _perf_g_total_samples;
+
+// common perf tools
+extern void _perf_start_timer();
+extern void _perf_end_timer();
+extern void _perf_print_timer();
+extern void _perf_calibrate();
+extern void _perf_blackhole(uint64_t value);
+
+// latency test
+extern uint64_t setup_pointer_tracing_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step);
+extern uint64_t read_pointer_tracing_linklist(uint64_t base_addr, uint64_t num_valid_node);
+extern void latency_test_warmup(uint64_t base_addr, uint64_t end_addr);
+extern float test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv);
+extern float test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv);
+extern float test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter, int to_csv);
+extern float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv);
+extern float test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv);
+extern float test_same_address_load_latency(int iter, int to_csv);
+extern float test_read_after_write_latency(int iter, int to_csv);
+extern float test_linear_write_latency(uint64_t size, uint64_t step, int iter, int to_csv);
+
+// bandwidth test
+extern float test_l1_load_bandwidth(uint64_t size, int iter, int to_csv);
+extern float test_l1_store_bandwidth(uint64_t size, int iter, int to_csv);
+extern float test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv);
+
+// key parameter matrix generation
+void generate_linear_access_latency_matrix(uint64_t step);
+void generate_pointer_tracing_latency_matrix(uint64_t step);
+void generate_random_access_latency_matrix();
+void generate_replacement_test_matrix();
+
+// legacy test
+extern void legacy_test_mem_throughput(uint64_t iter);
+extern void legacy_test_mem_throughput_same_set(uint64_t iter);
 
 #endif
\ No newline at end of file
diff --git a/apps/maprobe/include/resultmat.h b/apps/maprobe/include/resultmat.h
new file mode 100644
index 0000000000000000000000000000000000000000..629aa49b469a23e85d8a1663a9406c81f9b55462
--- /dev/null
+++ b/apps/maprobe/include/resultmat.h
@@ -0,0 +1,37 @@
+#ifndef PROBE_RESULT_MATRIX_H
+#define PROBE_RESULT_MATRIX_H
+
+#include <klib.h>
+
+struct result_matrix_meta {
+    char* name;
+    char* row_name;
+    char* column_name;
+    int row_size;
+    int column_size;
+    void* result_array;
+    void* column_array;
+    void* row_array;
+};
+
+void print_float_result_matrix(struct result_matrix_meta* meta);
+void matrix_print_example();
+
+#define FOR(v,end) for (int v = 0; v < end; v++)
+#define CONCAT(a,b) a##b
+#define TOSTR(a) #a
+#define DEFINE_FLOAT_RESULT_MATRIX(matrix_name, rowname, rowsize, columnname, columnsize) \
+    struct result_matrix_meta CONCAT(matrix_name,_matrix_meta); \
+    float CONCAT(matrix_name,_result_array)[rowsize][columnsize] = {0}; \
+    int CONCAT(matrix_name,_column_array)[columnsize] = {0}; \
+    int CONCAT(matrix_name,_row_array)[rowsize] = {0}; \
+    CONCAT(matrix_name,_matrix_meta).name = TOSTR(matrix_name); \
+    CONCAT(matrix_name,_matrix_meta).column_name = TOSTR(columnname); \
+    CONCAT(matrix_name,_matrix_meta).row_name = TOSTR(rowname); \
+    CONCAT(matrix_name,_matrix_meta).column_size = columnsize; \
+    CONCAT(matrix_name,_matrix_meta).row_size = rowsize; \
+    CONCAT(matrix_name,_matrix_meta).result_array = CONCAT(matrix_name,_result_array); \
+    CONCAT(matrix_name,_matrix_meta).column_array = CONCAT(matrix_name,_column_array); \
+    CONCAT(matrix_name,_matrix_meta).row_array = CONCAT(matrix_name,_row_array);
+
+#endif
\ No newline at end of file
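One detail worth flagging: `DEFINE_FLOAT_RESULT_MATRIX` mixes declarations with assignment statements, so each expansion must sit inside a function body (C99 or later), before any use of the generated names. A minimal usage sketch with hypothetical dimensions (`demo`, `size_kb`, `iter` are placeholder names):

```c
void demo_matrix_usage(void) {
    // 2 row labels x 3 column labels; all "demo_*" names below are
    // generated from the first macro argument
    DEFINE_FLOAT_RESULT_MATRIX(demo, size_kb, 2, iter, 3);
    FOR(r, 2) { demo_row_array[r] = r; }
    FOR(c, 3) { demo_column_array[c] = c; }
    demo_result_array[1][2] = 3.14f;          // store one measurement
    print_float_result_matrix(&demo_matrix_meta);
}
```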
diff --git a/apps/maprobe/latency-test.c b/apps/maprobe/latency-test.c
new file mode 100644
index 0000000000000000000000000000000000000000..f72cd949ef8e0f4aa350201759cd97a536c6d27a
--- /dev/null
+++ b/apps/maprobe/latency-test.c
@@ -0,0 +1,460 @@
+#include "maprobe.h"
+
+static inline uint64_t generate_rand_address(uint64_t base_addr, uint64_t end_addr, uint64_t align) { // static: a plain C99 "inline" emits no external definition and can fail to link
+    return (rand() % (end_addr - base_addr) + base_addr) / align * align;
+}
+
+void generate_rand_address_array(uint64_t* dest, uint64_t base_addr, uint64_t end_addr, uint64_t align, int number) {
+    for (int i = 0; i < number; i++) {
+        *(dest + i) = generate_rand_address(base_addr, end_addr, align);
+    }
+}
+
+uint64_t generate_pointer_tracing_address(uint64_t base_addr, uint64_t end_addr, uint64_t step) {
+    return setup_pointer_tracing_linklist(base_addr, end_addr, step);
+}
+
+uint64_t setup_pointer_tracing_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step)
+{
+    uint64_t num_valid_node = 0;
+    assert(step % 8 == 0);
+    assert(step >= 8);
+    for (uint64_t cur_addr = base_addr; cur_addr < end_addr;) {
+        uint64_t next_addr = cur_addr + step;
+        *((uint64_t*)cur_addr) = next_addr;
+        cur_addr = next_addr;
+        num_valid_node++;
+    }
+    return num_valid_node;
+}
+
+uint64_t read_pointer_tracing_linklist(uint64_t base_addr, uint64_t num_valid_node)
+{
+    uint64_t cur_addr = base_addr;
+    for (int i = 0; i < num_valid_node; i++) {
+        cur_addr = (*(uint64_t*)cur_addr);
+    }
+    return cur_addr;
+}
+
+void latency_test_warmup(uint64_t base_addr, uint64_t end_addr)
+{
+    setup_pointer_tracing_linklist(base_addr, end_addr, _PERF_CACHELINE_SIZE_BYTE);
+}
+
+float test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv)
+{
+    // printf("pointer tracing latency test\n");
+    // printf("range (B), read latency, iters, samples, cycles\n");
+    register uint64_t result = 0; // make sure the compiler does not optimize away read_pointer_tracing_linklist
+    _perf_start_timer();
+    uint64_t nnode = setup_pointer_tracing_linklist(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size, step);
+    _perf_end_timer();
+    uint64_t total_node = nnode * iter;
+    // _perf_print_timer();
+
+    _perf_start_timer();
+    for (int i = 0; i < iter; i++) {
+        result += read_pointer_tracing_linklist(_PERF_TEST_ADDR_BASE, nnode);
+    }
+    _perf_end_timer();
+    // _perf_print_timer();
+    float acpa = (float)perf.cycle / total_node; // average cycle per access
+    if (to_csv) {
+        printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_node, perf.cycle);
+    } else {
+        printf("range %ldKB (%d iters) pointer tracing read latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n",
+            size/KB, iter, acpa, total_node * 8 * BYTE / (float)perf.cycle, total_node, perf.cycle
+        );
+    }
+    _perf_g_total_samples += total_node;
+    _perf_blackhole(result);
+    return acpa;
+}
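Each node's next pointer is only known after the previous load completes, so the chain serializes the loads and cycles-per-node approximates raw load-to-use latency rather than bandwidth. Schematically, for a three-node list (hypothetical `base`/`step` values; this is just what setup + read do):

```c
// after setup_pointer_tracing_linklist(base, base + 3*step, step):
//   *(uint64_t*)(base)          == base + step
//   *(uint64_t*)(base + step)   == base + 2*step
//   *(uint64_t*)(base + 2*step) == base + 3*step
// read_pointer_tracing_linklist(base, 3) then walks:
uint64_t p = base;
p = *(uint64_t*)p;  // each load's address is the previous load's data,
p = *(uint64_t*)p;  // so consecutive node accesses cannot overlap
p = *(uint64_t*)p;  // in the pipeline
```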
+
+float test_same_address_load_latency(int iter, int to_csv)
+{
+    // printf("same address load latency test\n");
+    // printf("range (B), read latency, iters, samples, cycles\n");
+    register uint64_t result = 0;
+    // _perf_print_timer();
+
+    _perf_start_timer();
+    uint64_t address = _PERF_TEST_ADDR_BASE;
+    for (int i = 0; i < iter; i++) {
+        result += *((volatile uint64_t*) (address));
+    }
+    _perf_end_timer();
+    // _perf_print_timer();
+    uint64_t total_access = iter;
+    float acpa = (float)perf.cycle / total_access; // average cycle per access
+    if (to_csv) {
+        printf("%ld, %f, %d, %ld, %ld\n", 0L, acpa, iter, total_access, perf.cycle);
+    } else {
+        printf("same address read latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n",
+            acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle
+        );
+    }
+    _perf_g_total_samples += total_access;
+    _perf_blackhole(result);
+    return acpa;
+}
+
+float test_read_after_write_latency(int iter, int to_csv)
+{
+    // printf("same address store-load latency test\n");
+    // printf("range (B), read latency, iters, samples, cycles\n");
+    volatile uint64_t result = 0; // volatile: each iteration stores result, and the next iteration's add reloads it, forming the store-to-load dependency under test
+    // _perf_print_timer();
+
+    _perf_start_timer();
+    for (int i = 0; i < iter; i++) {
+        result += *((uint64_t*) _PERF_TEST_ADDR_BASE);
+    }
+    _perf_end_timer();
+    // _perf_print_timer();
+    uint64_t total_access = iter;
+    float acpa = (float)perf.cycle / total_access; // average cycle per access
+    if (to_csv) {
+        printf("%ld, %f, %d, %ld, %ld\n", 0L, acpa, iter, total_access, perf.cycle);
+    } else {
+        printf("read after write latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n",
+            acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle
+        );
+    }
+    _perf_g_total_samples += total_access;
+    _perf_blackhole(result);
+    return acpa;
+}
+
+float test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter, int to_csv)
+{
+    // printf("stride %ld linear access latency test\n", step);
+    // printf("range (B), read latency, iters, samples, cycles\n");
+    register uint64_t result = 0;
+    uint64_t num_access = size / step;
+    // _perf_print_timer();
+
+    _perf_start_timer();
+    for (int i = 0; i < iter; i++) {
+        uint64_t address = _PERF_TEST_ADDR_BASE;
+        for (int j = 0; j < num_access; j++) {
+            result += *((volatile uint64_t*) (address));
+            address += step;
+        }
+    }
+    _perf_end_timer();
+    // _perf_print_timer();
+    uint64_t total_access = num_access * iter;
+    float acpa = (float)perf.cycle / total_access; // average cycle per access
+    if (to_csv) {
+        printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_access, perf.cycle);
+    } else {
+        printf("range %ldKB (%d iters) simple linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %ldB\n",
+            size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step
+        );
+    }
+    _perf_g_total_samples += total_access;
+    _perf_blackhole(result);
+    return acpa;
+}
+
+float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv)
+{
+    // printf("stride %ld linear access latency test\n", step);
+    // printf("range (B), read latency, iters, samples, cycles\n");
+    uint64_t num_access = size / step;
+    num_access += num_access % 8 ?
8 - num_access % 8 : 0; + assert(num_access >= 8); + // prepare access offset + register uint64_t address_offset_1 = step * 1; + register uint64_t address_offset_2 = step * 2; + register uint64_t address_offset_3 = step * 3; + register uint64_t address_offset_4 = step * 4; + register uint64_t address_offset_5 = step * 5; + register uint64_t address_offset_6 = step * 6; + register uint64_t address_offset_7 = step * 7; + register uint64_t address_offset_8 = step * 8; + + // _perf_print_timer(); + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + uint64_t address = _PERF_TEST_ADDR_BASE; + for (int j = 0; j < num_access; j += 8) { + __asm__ volatile ( + "mv a1, %[addr]\n" + "add a2, %[addr], %[offset1]\n" + "add a3, %[addr], %[offset2]\n" + "add a4, %[addr], %[offset3]\n" + "add a5, %[addr], %[offset4]\n" + "add t0, %[addr], %[offset5]\n" + "add t1, %[addr], %[offset6]\n" + "add t2, %[addr], %[offset7]\n" + "ld a0, 0(a1)\n" + "ld a0, 0(a2)\n" + "ld a0, 0(a3)\n" + "ld a0, 0(a4)\n" + "ld a0, 0(a5)\n" + "ld a0, 0(t0)\n" + "ld a0, 0(t1)\n" + "ld a0, 0(t2)\n" + :: + [offset1] "r"(address_offset_1), + [offset2] "r"(address_offset_2), + [offset3] "r"(address_offset_3), + [offset4] "r"(address_offset_4), + [offset5] "r"(address_offset_5), + [offset6] "r"(address_offset_6), + [offset7] "r"(address_offset_7), + [addr] "r"(address) + : "a0", "a1", "a2", "a3", "a4", "a5", "t0", "t1", "t2", "t3" + ); + address += address_offset_8; + // register uint64_t access_addr_0 = address + address_offset_0; + // register uint64_t access_addr_1 = address + address_offset_1; + // register uint64_t access_addr_2 = address + address_offset_2; + // register uint64_t access_addr_3 = address + address_offset_3; + // register uint64_t access_addr_4 = address + address_offset_4; + // register uint64_t access_addr_5 = address + address_offset_5; + // register uint64_t access_addr_6 = address + address_offset_6; + // register uint64_t access_addr_7 = address + address_offset_7; + // address += address_offset_8; + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_0) : "a0"); + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_1) : "a0"); + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_2) : "a0"); + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_3) : "a0"); + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_4) : "a0"); + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_5) : "a0"); + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_6) : "a0"); + // __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_7) : "a0"); + } + } + _perf_end_timer(); + // _perf_print_timer(); + uint64_t total_access = num_access * iter; + float acpa = (float)perf.cycle / total_access; // average cycle per access + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_access, perf.cycle); + } else { + printf("range %ldKB (%d iters) batch(8) linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", + size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step + ); + } + _perf_g_total_samples += total_access; + return acpa; +} + +float test_linear_write_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv) +{ + // printf("stride %d linear access latency test\n", step); + // printf("range (B), read latency, iters, samples, cycles\n"); + uint64_t num_access = size / step; + num_access += num_access % 8 
? 8 - num_access % 8 : 0; + assert(num_access >= 8); + // prepare access offset + uint64_t address_offset_0 = 0; + register uint64_t address_offset_1 = step * 1; + register uint64_t address_offset_2 = step * 2; + register uint64_t address_offset_3 = step * 3; + register uint64_t address_offset_4 = step * 4; + register uint64_t address_offset_5 = step * 5; + register uint64_t address_offset_6 = step * 6; + register uint64_t address_offset_7 = step * 7; + register uint64_t address_offset_8 = step * 8; + + // _perf_print_timer(); + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + uint64_t address = _PERF_TEST_ADDR_BASE; + for (int j = 0; j < num_access; j += 8) { + register uint64_t access_addr_0 = address + address_offset_0; + register uint64_t access_addr_1 = address + address_offset_1; + register uint64_t access_addr_2 = address + address_offset_2; + register uint64_t access_addr_3 = address + address_offset_3; + register uint64_t access_addr_4 = address + address_offset_4; + register uint64_t access_addr_5 = address + address_offset_5; + register uint64_t access_addr_6 = address + address_offset_6; + register uint64_t access_addr_7 = address + address_offset_7; + address += address_offset_8; + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_0) : "a0"); + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_1) : "a0"); + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_2) : "a0"); + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_3) : "a0"); + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_4) : "a0"); + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_5) : "a0"); + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_6) : "a0"); + __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_7) : "a0"); + } + } + _perf_end_timer(); + // _perf_print_timer(); + uint64_t total_access = num_access * iter; + float acpa = (float)perf.cycle / total_access; // average cycle per access + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_access, perf.cycle); + } else { + printf("range %ldKB (%d iters) batch(8) linear write latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", + size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step + ); + } + _perf_g_total_samples += total_access; + return acpa; +} + +float test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv) +{ + return test_linear_access_latency_batch8(size, step, iter, to_csv); +} + +float test_linear_write_latency(uint64_t size, uint64_t step, int iter, int to_csv) +{ + return test_linear_write_latency_batch8(size, step, iter, to_csv); +} + +float test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv) +{ + // printf("align %d random access (cache line) latency test, %s\n", + // test_align, pregen_addr ? 
"use pregen addr array" : "gen rand addr at run time" + // ); + // printf("range (B), read latency, iters, samples, cycles\n"); + register uint64_t result = 0; + // _perf_print_timer(); + + uint64_t total_access = num_access * iter; + if (test_range > total_access*8*_PERF_CACHELINE_SIZE_BYTE) { + printf("total access size %ldKB less than test range %ldKB, ignored\n", + total_access*8*_PERF_CACHELINE_SIZE_BYTE/KB, + test_range/KB + ); + return 0; + } + + // alloc memory for random access addr array and data + assert(test_align >= 8 * BYTE); + // assert(size >= test_align); + // uint64_t num_access = size / test_align; + if (pregen_addr) { + uint64_t test_array_base_addr = _PERF_TEST_ADDR_BASE + num_access * sizeof(uint64_t*); + uint64_t address_array_base_addr = _PERF_TEST_ADDR_BASE; + generate_rand_address_array((uint64_t*)address_array_base_addr, test_array_base_addr, test_array_base_addr + test_range, test_align, num_access); + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + for (int j = 0; j < num_access; j++) { + result += *((uint64_t*) (address_array_base_addr + j * sizeof(uint64_t*))); + } + } + _perf_end_timer(); + } else { + _perf_start_timer(); + for (int i = 0; i < iter; i++) { + for (int j = 0; j < num_access; j++) { + result += *((uint64_t*) (generate_rand_address(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + test_range, test_align))); + } + } + _perf_end_timer(); + } + // _perf_print_timer(); + float acpa = (float)perf.cycle / total_access; // average cycle per access + if (to_csv) { + printf("%ld, %f, %d, %ld, %ld\n", test_range, acpa, iter, total_access, perf.cycle); + } else { + printf("range %ldKB, access %ldKB (cover %ldKB) (%d iters) random read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), align %ldB, %s\n", + test_range/KB, total_access*8*BYTE/KB, total_access*8*_PERF_CACHELINE_SIZE_BYTE/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, test_align, + pregen_addr ? 
"pregen addr" : "runtime addr" + ); + } + _perf_g_total_samples += total_access; + _perf_blackhole(result); + return acpa; +} + +void legacy_test_mem_throughput(uint64_t iter) +{ + uint64_t remain = iter; + uint64_t result = 0; + uint64_t access_addr = _PERF_TEST_ADDR_BASE; + _perf_start_timer(); + while (remain--) { + result += *(uint64_t*) access_addr; + access_addr += _PERF_CACHELINE_SIZE_BYTE; + } + _perf_end_timer(); + *(uint64_t*) _PERF_BLACKHOLE = result; + printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter); +} + +void legacy_test_mem_throughput_same_set(uint64_t iter) +{ + uint64_t remain = iter; + uint64_t result = 0; + uint64_t access_addr = _PERF_TEST_ADDR_BASE; + _perf_start_timer(); + while (remain--) { + result += *(uint64_t*) access_addr; + access_addr += _PERF_ADDR_STRIDE_L1_SAME_SET; + } + _perf_end_timer(); + *(uint64_t*) _PERF_BLACKHOLE = result; + printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter); +} + +void generate_linear_access_latency_matrix(uint64_t step) +{ + // step can be _PERF_CACHELINE_SIZE_BYTE or 8*BYTE +#define LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB 14 + // LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB 14: 14 cases in total, from 1KB to 8MB + DEFINE_FLOAT_RESULT_MATRIX(linear_access_latency,size_kb_pow2,LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB,iter,3); + FOR(x,LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB) { linear_access_latency_row_array[x] = x; } + FOR(x,3) { linear_access_latency_column_array[x] = x; } + for (int i = 0; i < LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB; i++) { + int warm_up_iter = i < 6 ? 4 : 1; + int test_iter = i < 6 ? 4 : 2; + linear_access_latency_result_array[i][0] = test_linear_access_latency((1< +#include "maprobe.h" + +void typical_linear_load_test_set() +{ + _perf_calibrate(); + printf("------------- linear load test set -------------\n"); + printf("page size linear double word load:\n"); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 1, 0); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 2, 0); + printf("page size linear cache line load:\n"); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + printf("dcache/2 linear double word load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, sizeof(uint64_t), 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, sizeof(uint64_t), 2, 0); + printf("dcache/2 linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + printf("dcache linear double word load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, sizeof(uint64_t), 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, sizeof(uint64_t), 2, 0); + printf("dcache linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + printf("L2 linear cache line load:\n"); + test_linear_access_latency(_PERF_L2_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 1, 0); + test_linear_access_latency(_PERF_L2_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + printf("L1 (L1 same set) linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 10, 0); + 
test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 100, 0); + printf("L2 (L1 same set) linear cache line load:\n"); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 2, 0); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 4, 0); + printf("L1 (L2 same slice) linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 1, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 2, 0); + printf("L2 (L2 same slice) linear cache line load:\n"); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 1, 0); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 2, 0); + printf("L1 (page traverse) linear cache line load:\n"); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 10, 0); + test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 100, 0); + printf("L2 (page traverse) linear cache line load:\n"); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 2, 0); + test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 4, 0); + printf("total samples: %ld\n", _perf_g_total_samples); +} + +void typical_random_load_test_set() +{ + printf("------------- random load test set -------------\n"); + printf("from page size random load (word):\n"); + test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, 8*BYTE, 1, 1, 0); + test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, 8*BYTE, 0, 1, 0); + printf("from page size random load (cache line):\n"); + test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + printf("from dcache/2 size random load (word):\n"); + test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, 8*BYTE, 1, 1, 0); + test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, 8*BYTE, 0, 1, 0); + printf("from dcache/2 size random load (cache line):\n"); + test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + printf("from dcache size random load (word):\n"); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, 8*BYTE, 1, 1, 0); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, 8*BYTE, 0, 1, 0); + printf("from dcache size random load (cache line):\n"); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + printf("from l2 size random load (word):\n"); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, 8*BYTE, 1, 1, 0); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, 8*BYTE, 0, 1, 0); + printf("from l2 size random load (cache line):\n"); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + printf("total samples: %ld\n", 
_perf_g_total_samples);
+}
+
+void typical_pointer_tracing_load_test_set()
+{
+    printf("------------- pointer tracing load test set -------------\n");
+    printf("double word by double word tracing:\n");
+    test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, 8*BYTE, 10, 0);
+    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, 8*BYTE, 2, 0);
+    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, 8*BYTE, 2, 0);
+    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, 8*BYTE, 2, 0);
+    printf("cacheline by cacheline tracing:\n");
+    test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 10, 0);
+    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
+    test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
+    printf("page by page tracing:\n");
+    test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE*2, _PERF_PAGE_SIZE_BYTE, 10, 0);
+    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, _PERF_PAGE_SIZE_BYTE, 10, 0);
+    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_PAGE_SIZE_BYTE, 10, 0);
+    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_PAGE_SIZE_BYTE, 10, 0);
+    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_PAGE_SIZE_BYTE, 10, 0);
+    printf("total samples: %ld\n", _perf_g_total_samples);
+}
+
+void typical_memory_disambiguation_test_set()
+{
+    printf("------------- memory disambiguation test set -------------\n");
+    printf("load from the same address:\n");
+    test_same_address_load_latency(1024, 0);
+    test_same_address_load_latency(1024, 0);
+    test_same_address_load_latency(1024, 0);
+    printf("load then store to the same address:\n");
+    test_read_after_write_latency(1024, 0);
+    test_read_after_write_latency(1024, 0);
+    test_read_after_write_latency(1024, 0);
+    // more to be added
+}
+
+void typical_l1_access_test_set()
+{
+    printf("------------- typical dcache access pattern test set -------------\n");
+    printf("ideal load bandwidth:\n");
+    test_l1_load_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
+    test_l1_load_bandwidth(_PERF_L1_SIZE_BYTE, 10, 0);
+    printf("ideal store bandwidth:\n");
+    test_l1_store_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
+    test_l1_store_bandwidth(_PERF_L1_SIZE_BYTE, 10, 0);
+    printf("ideal write combine buffer bandwidth:\n");
+    test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
+    test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, 5, 0);
+    printf("replacement error penalty:\n");
+    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
+    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
+}
+
+// typical latency test for fast regression
+void typical_latency_test()
+{
+    _perf_g_total_samples = 0;
+    typical_l1_access_test_set();
+    typical_linear_load_test_set();
+    typical_random_load_test_set();
+    typical_pointer_tracing_load_test_set();
+    typical_memory_disambiguation_test_set();
+}
+
+void pointer_tracing_graph()
+{
+    _perf_g_total_samples = 0;
+    _perf_calibrate();
+    printf("data for pointer tracing latency graph:\n");
+    printf("range (B), read latency, iters, samples\n");
+    for (int i = 1*KB; i < 64*KB; i = i + 1*KB) {
+        test_pointer_tracing_latency(i, _PERF_CACHELINE_SIZE_BYTE, 2, 1);
+    }
+    for (int i = 64*KB; i < 1024*KB; i = i + 64*KB) {
+
test_pointer_tracing_latency(i, _PERF_CACHELINE_SIZE_BYTE, 1, 1); + } + test_pointer_tracing_latency(1024*KB, _PERF_CACHELINE_SIZE_BYTE, 1, 1); + for (int i = 1*MB; i <8*MB; i = i + 1*MB) { + test_pointer_tracing_latency(i, _PERF_CACHELINE_SIZE_BYTE, 1, 1); + } + printf("total samples: %ld\n", _perf_g_total_samples); +} + +// a simple test set used to check if test is working correctly +void latency_test_example() +{ + _perf_calibrate(); + printf("latency test example:\n"); + test_l1_load_bandwidth(4*KB, 5, 0); + test_l1_load_bandwidth(4*KB, 5, 0); + test_l1_store_bandwidth(4*KB, 5, 0); + test_l1_store_bandwidth(4*KB, 5, 0); + test_l1_store_wcb_bandwidth(8*KB, 5, 0); + test_l1_store_wcb_bandwidth(8*KB, 5, 0); + test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0); + test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*4,_PERF_ADDR_STRIDE_L1_SAME_SET,8,0); + test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); + test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); + test_same_address_load_latency(1024, 0); + test_read_after_write_latency(1024, 0); + printf("total samples: %ld\n", _perf_g_total_samples); +} + +void l2_l3_pressure_test() +{ + _perf_calibrate(); + printf("L2 and L3 same set pressure test:\n"); + for (int i = 1; i < 16; i++) { + printf("ways accessed: %d\n", i); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0); + } + for (int i = 16; i <= 512; i*=2) { + printf("ways accessed: %d\n", i); + // jump at i = 32 + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0); + } + + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*64,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*64,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + // jump at i = 128 + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*128,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*128,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*256,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*256,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*512,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*512,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); +} + +void legacy_latency_throughput_test() +{ + _perf_calibrate(); + printf("Memory throughput:\n"); + legacy_test_mem_throughput(1024); + printf("L1 latency:\n"); + test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0); + test_pointer_tracing_latency(_PERF_L1_NOALIAS_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0); + test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, 
_PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    printf("L2 latency:\n");
+    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    // test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    printf("L3 latency:\n");
+    test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
+    // test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE,2, 0);
+    // printf("MEM:\n");
+    // test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE*2, _PERF_CACHELINE_SIZE_BYTE,2, 0);
+    printf("total samples: %ld\n", _perf_g_total_samples);
+}
+
+int main()
+{
+    latency_test_example();
+
+    generate_linear_access_latency_matrix(8*BYTE);
+    generate_linear_access_latency_matrix(_PERF_CACHELINE_SIZE_BYTE);
+    generate_pointer_tracing_latency_matrix(8*BYTE);
+    generate_pointer_tracing_latency_matrix(_PERF_CACHELINE_SIZE_BYTE);
+    // generate_random_access_latency_matrix();
+    generate_replacement_test_matrix();
+
+    // matrix_print_example();
+    typical_latency_test();
+    // pointer_tracing_graph();
+    // latency_test();
+    // legacy_latency_throughput_test();
+    l2_l3_pressure_test();
+    return 0;
+}
\ No newline at end of file
diff --git a/apps/maprobe/maprobe.c b/apps/maprobe/maprobe.c
deleted file mode 100644
index 16563d7836a93cd0199d70be6cf5225b960acdac..0000000000000000000000000000000000000000
--- a/apps/maprobe/maprobe.c
+++ /dev/null
@@ -1,24 +0,0 @@
-#include <klib.h>
-#include "maprobe.h"
-
-int main()
-{
-    _perf_calibrate();
-    printf("Memory throughput:\n");
-    test_mem_throughput(512);
-    printf("L1 latency:\n");
-    test_latency(4 * KB, 5);
-    test_latency(_PERF_L1_NOALIAS_SIZE_BYTE, 2);
-    test_latency(_PERF_L1_SIZE_BYTE/2, 2);
-    test_latency(_PERF_L1_SIZE_BYTE, 2);
-    printf("L2 latency:\n");
-    test_latency(_PERF_L2_SIZE_BYTE/2, 2);
-    // test_latency(_PERF_L2_SIZE_BYTE, 2);
-    printf("L3 latency:\n");
-    test_latency(_PERF_L3_SIZE_BYTE/2, 2);
-    // test_latency(_PERF_L3_SIZE_BYTE,2);
-    // printf("MEM:\n");
-    // test_latency(_PERF_L3_SIZE_BYTE*2,2);
-
-    return 0;
-}
\ No newline at end of file
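Since maprobe.h now guards `_PERF_TEST_ADDR_BASE` with `#ifndef`, the test base address (and the `PERF_SIM` switch) are intended to be set from the build line rather than by editing the header. A hypothetical invocation, assuming the AM build system forwards extra compiler flags (exact variable names depend on the local setup):

```sh
# hypothetical: override the probe's base address and disable CSR reads
make ARCH=riscv64-xs CFLAGS="-D_PERF_TEST_ADDR_BASE=0x2000400000 -DPERF_SIM"
```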
diff --git a/apps/maprobe/replacement-test.c b/apps/maprobe/replacement-test.c
new file mode 100644
index 0000000000000000000000000000000000000000..641bce8bd3dd2e729477fc8bac3562fec12672f2
--- /dev/null
+++ b/apps/maprobe/replacement-test.c
@@ -0,0 +1,20 @@
+#include "maprobe.h"
+
+void generate_replacement_test_matrix()
+{
+#define REPLACEMENT_TEST_MAX_WAY 17 // test 1 to 17 ways, one beyond a 16-way set
+#define REPLACEMENT_TEST_ITER 5 // 1 warmup + 4 test
+    assert(REPLACEMENT_TEST_ITER >= 2);
+    DEFINE_FLOAT_RESULT_MATRIX(replacement_test,num_way_accessed,REPLACEMENT_TEST_MAX_WAY,iter,REPLACEMENT_TEST_ITER);
+    FOR(x,REPLACEMENT_TEST_ITER) { replacement_test_column_array[x] = x; }
+    for (int i = 0; i < REPLACEMENT_TEST_MAX_WAY; i++) {
+        replacement_test_row_array[i] = i+1;
+        int warm_up_iter = 64;
+        int test_iter = i < 4 ? 256 : 64;
+        replacement_test_result_array[i][0] = test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*(i+1),_PERF_ADDR_STRIDE_L1_SAME_SET,warm_up_iter,0); // warmup
+        for(int j = 1; j < REPLACEMENT_TEST_ITER; j++) {
+            replacement_test_result_array[i][j] = test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*(i+1),_PERF_ADDR_STRIDE_L1_SAME_SET,test_iter,0); // test
+        }
+    }
+    print_float_result_matrix(&replacement_test_matrix_meta);
+}
\ No newline at end of file
diff --git a/apps/maprobe/resultmat.c b/apps/maprobe/resultmat.c
new file mode 100644
index 0000000000000000000000000000000000000000..a264f182e472c889fac4d75a461f03d0a5df2551
--- /dev/null
+++ b/apps/maprobe/resultmat.c
@@ -0,0 +1,55 @@
+#include "resultmat.h"
+
+void print_float_result_matrix(struct result_matrix_meta* meta)
+{
+    assert(meta);
+    printf("---------- %s matrix start ----------\n", meta->name);
+    printf("%s (row) \\ %s (column)\n", meta->row_name, meta->column_name);
+    if (meta->column_array) {
+        if (meta->row_array) {
+            printf("\\ , \t");
+        }
+        for (int c = 0; c < meta->column_size; c++) {
+            printf(" %d,\t", *((int*)meta->column_array + c));
+        }
+        printf("\n");
+    }
+    for (int r = 0; r < meta->row_size; r++) {
+        if (meta->row_array) {
+            printf("%3d,\t", *((int*)meta->row_array + r));
+        }
+        for (int c = 0; c < meta->column_size; c++) {
+            printf("%f,\t", *((float*)meta->result_array + r * meta->column_size + c));
+        }
+        printf("\n");
+    }
+    printf("---------- %s matrix end ----------\n", meta->name);
+}
+
+void matrix_print_example()
+{
+    DEFINE_FLOAT_RESULT_MATRIX(test,testrow,5,testcol,10);
+    // the macro above expands to roughly:
+    // struct result_matrix_meta test_matrix_meta;
+    // float test_result_array[5][10] = {0};
+    // int test_column_array[10] = {0};
+    // int test_row_array[5] = {0};
+    // test_matrix_meta.name = "test";
+    // test_matrix_meta.column_name = "testcol";
+    // test_matrix_meta.row_name = "testrow";
+    // test_matrix_meta.column_size = 10;
+    // test_matrix_meta.row_size = 5;
+    // test_matrix_meta.result_array = test_result_array;
+    // test_matrix_meta.column_array = test_column_array;
+    // test_matrix_meta.row_array = test_row_array;
+
+    FOR(x,5) { test_row_array[x] = x; }
+    FOR(x,10) { test_column_array[x] = x; }
+    FOR(x,5) {
+        FOR(y,10) {
+            test_result_array[x][y] = rand();
+        }
+    }
+    print_float_result_matrix(&test_matrix_meta);
+}
\ No newline at end of file
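For reference, the output shape of `print_float_result_matrix` follows directly from the printf calls above. For the hypothetical 2x3 `demo` matrix from the earlier usage sketch (zero-initialized values, labels 0..n):

```
---------- demo matrix start ----------
size_kb (row) \ iter (column)
\ , 	 0,	 1,	 2,
  0,	0.000000,	0.000000,	0.000000,
  1,	0.000000,	0.000000,	0.000000,
---------- demo matrix end ----------
```

The trailing commas and tab separators make the block easy to paste into a spreadsheet or CSV parser, which matches how the `to_csv` mode of the individual tests is intended to be consumed.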