未验证 提交 4cdfb7b0 编写于 作者: W William Wang 提交者: GitHub

Merge pull request #32 from OpenXiangShan/maprobe-230305

maprobe: add basic lsu microbenchmark
NAME = maprobe
SRCS = maprobe.c
SRCS = common.c bitutils.c resultmat.c latency-test.c bandwidth-test.c replacement-test.c main.c
include $(AM_HOME)/Makefile.app
# Micro Architecture Probe (MAProbe)
* Memory access latency test
\ No newline at end of file
#include "maprobe.h"
// Measure ideal L1 dcache load bandwidth.
// Walks [_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size) one cache line at
// a time, issuing 8 independent double-word loads per line (offsets 0..56),
// repeated iter times.
//   size   - bytes to sweep, must be >= one cache line
//   iter   - number of passes over the range
//   to_csv - non-zero prints a CSV row, zero prints a human-readable summary
// Returns the measured bandwidth in bytes per cycle.
// NOTE(review): assumes the test range is mapped and cacheable - confirm for
// the target platform.
float test_l1_load_bandwidth(uint64_t size, int iter, int to_csv)
{
// printf("stride %d linear access latency test\n", step);
// printf("range (B), read latency, iters, samples, cycles\n");
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
// _perf_print_timer();
_perf_start_timer();
for (int i = 0; i < iter; i++) {
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
// 8 independent double-word loads covering one cache line; a0 is listed
// as clobbered so the compiler does not cache values across statements
__asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
}
}
_perf_end_timer();
// _perf_print_timer();
// 8 loads per cache line, per iteration
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
// each load moves 8 bytes
float bandwidth = total_access * 8 * BYTE / (float)perf.cycle;
if (to_csv) {
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
} else {
printf("range %ldKB (%d iters) dcache linear (8Byte) read, latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
size/KB, iter, (float)perf.cycle / total_access, bandwidth, total_access, perf.cycle, 8
);
}
_perf_g_total_samples += total_access;
return bandwidth;
}
// Measure ideal L1 dcache store bandwidth.
// Same access pattern as test_l1_load_bandwidth, but with 8 double-word
// stores per cache line (whatever happens to be in a0 is stored; only the
// store traffic matters, not the data).
//   size   - bytes to sweep, must be >= one cache line
//   iter   - number of passes over the range
//   to_csv - non-zero prints a CSV row, zero prints a human-readable summary
// Returns the measured bandwidth in bytes per cycle.
float test_l1_store_bandwidth(uint64_t size, int iter, int to_csv)
{
// printf("stride %d linear access latency test\n", step);
// printf("range (B), read latency, iters, samples, cycles\n");
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
// _perf_print_timer();
_perf_start_timer();
for (int i = 0; i < iter; i++) {
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
// 8 double-word stores covering one cache line (offsets 0..56)
__asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
}
}
_perf_end_timer();
// _perf_print_timer();
// 8 stores per cache line, per iteration
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
// each store moves 8 bytes
float bandwidth = total_access * 8 * BYTE / (float)perf.cycle;
if (to_csv) {
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
} else {
printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
size/KB, iter, (float)perf.cycle / total_access, bandwidth, total_access, perf.cycle, 8
);
}
_perf_g_total_samples += total_access;
return bandwidth;
}
// Measure store bandwidth with a single double-word store per cache line.
// Because each store touches a different line, the write-combine / refill
// path (L1 <-> L2 traffic) dominates rather than raw store issue bandwidth.
//   size   - bytes to sweep, must be >= one cache line
//   iter   - number of passes over the range
//   to_csv - non-zero prints a CSV row, zero prints a human-readable summary
// Returns bandwidth in bytes per cycle counted in whole cache lines
// (total lines touched * line size / cycles), i.e. the "L1-L2" figure
// printed in the non-CSV output, not the 8-bytes-per-store figure.
float test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv)
{
// printf("stride %d linear access latency test\n", step);
// printf("range (B), read latency, iters, samples, cycles\n");
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
// _perf_print_timer();
_perf_start_timer();
for (int i = 0; i < iter; i++) {
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
// one store per cache line: every access misses into a new line
__asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
}
}
_perf_end_timer();
// _perf_print_timer();
// one store (= one cache line) per loop step, per iteration
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * iter;
float bandwidth = total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle;
if (to_csv) {
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
} else {
printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (L1-L2 %f B/cycle) (%ld samples, %ld cycles), stride %dB\n",
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle, total_access, perf.cycle, _PERF_CACHELINE_SIZE_BYTE
);
}
_perf_g_total_samples += total_access;
return bandwidth;
}
\ No newline at end of file
#include "bitutils.h"
// Build a 64-bit mask selecting bit positions [low, high).
// e.g. _perf_get_bit_mask(4, 8) == 0xf0.
// Fix: the original shifted the int constant 1, which is undefined
// behavior / wrong for high > 31; use 1ULL for a full 64-bit shift.
// The extern declaration (also in bitutils.h) gives this C99 inline
// definition an external definition in this translation unit.
extern uint64_t _perf_get_bit_mask(int low, int high);
inline uint64_t _perf_get_bit_mask(int low, int high) {
    assert(low < high);
    assert(low >= 0);
    assert(high < 63);
    // (1ULL << high) - 1 sets bits [0, high); shift right then left clears [0, low)
    return (((1ULL << high) - 1) >> low) << low;
}
// Extract bits [low, high) of raw_data, right-aligned to bit 0.
// e.g. _perf_get_bits(0xabcd, 4, 8) == 0xc.
// Fix: the original computed the mask with the int constant 1, which is
// undefined behavior / wrong for high > 31; use 1ULL.
extern uint64_t _perf_get_bits(uint64_t raw_data, int low, int high);
inline uint64_t _perf_get_bits(uint64_t raw_data, int low, int high) {
    assert(low < high);
    assert(low >= 0);
    assert(high < 63);
    // keep bits [0, high), then drop bits [0, low)
    uint64_t mask = (1ULL << high) - 1;
    return (raw_data & mask) >> low;
}
// Extract a single bit of raw_data; the result is 0 or 1.
// position must lie in [0, 63].
// The extern declaration (also in bitutils.h) gives this C99 inline
// definition an external definition in this translation unit.
extern uint64_t _perf_get_bit(uint64_t raw_data, int position);
inline uint64_t _perf_get_bit(uint64_t raw_data, int position) {
    assert(position >= 0);
    assert(position <= 63);
    const uint64_t shifted = raw_data >> position;
    return shifted & 1ULL;
}
// Return raw_data with bit field [low, high) replaced by new_value
// (new_value is masked to the field width).
// Fixes two defects in the original:
//   1. the mask was truncated through an `int` local;
//   2. the old field bits were not cleared (`raw_data & mask` kept the old
//      field and discarded everything else) before inserting new_value.
extern uint64_t _perf_set_bits(uint64_t raw_data, int low, int high, uint64_t new_value);
inline uint64_t _perf_set_bits(uint64_t raw_data, int low, int high, uint64_t new_value) {
    assert(low < high);
    assert(low >= 0);
    assert(high < 63);
    // mask selecting bits [low, high); 1ULL avoids 32-bit shift overflow
    uint64_t mask = (((1ULL << high) - 1) >> low) << low;
    // clear the field, then OR in the new value, limited to the field
    return (raw_data & ~mask) | ((new_value << low) & mask);
}
// Return raw_data with the bit at `position` set to (new_value & 1).
// Fix: the original returned `raw_data & (bit << position)`, which discarded
// every other bit of raw_data and could not set a bit at all. The correct
// operation clears the target bit, then ORs in the new bit value.
extern uint64_t _perf_set_bit(uint64_t raw_data, int position, int new_value);
inline uint64_t _perf_set_bit(uint64_t raw_data, int position, int new_value) {
    assert(position >= 0);
    assert(position <= 63);
    return (raw_data & ~(1ULL << position)) | ((uint64_t)(new_value & 1) << position);
}
#include "maprobe.h"
struct perf perf;
uint64_t _perf_g_total_samples = 0;
// Begin a measurement window: snapshot the minstret and mcycle CSRs into
// the global perf struct. _perf_end_timer() later converts them to deltas.
// Under PERF_SIM the CSR reads are compiled out (simulator has no HPM CSRs).
void _perf_start_timer()
{
#ifndef PERF_SIM
perf.instrcnt = csr_read(CSR_MINSTRET);
perf.cycle = csr_read(CSR_MCYCLE);
#endif
}
// End a measurement window: re-read the CSRs and turn perf.cycle /
// perf.instrcnt from start snapshots into elapsed deltas.
// Under PERF_SIM the CSR reads are compiled out (simulator has no HPM CSRs).
void _perf_end_timer()
{
#ifndef PERF_SIM
perf.cycle = csr_read(CSR_MCYCLE) - perf.cycle;
perf.instrcnt = csr_read(CSR_MINSTRET) - perf.instrcnt;
#endif
}
// Print the cycle/instruction deltas captured by the last
// _perf_start_timer()/_perf_end_timer() pair, plus the derived IPC.
// Fix: the original used %d for the uint64_t counters, which is undefined
// behavior (prints garbage on RV64); use %lu to match the 64-bit fields.
void _perf_print_timer()
{
printf("cycle %lu inst %lu ipc %f\n", perf.cycle, perf.instrcnt, (float)perf.instrcnt/perf.cycle);
}
// Measure the intrinsic overhead of reading the perf CSRs (back-to-back
// mcycle reads and back-to-back minstret reads) so later measurements can
// be interpreted against that baseline. Results go to perf.csr_read_cycle
// and perf.csr_read_ninst.
// Fix: the original printed the uint64_t results with %d (undefined
// behavior); use %lu.
void _perf_calibrate()
{
#ifndef PERF_SIM
// csr read delay
uint64_t cycle_1 = csr_read(CSR_MCYCLE);
uint64_t cycle_2 = csr_read(CSR_MCYCLE);
perf.csr_read_cycle = cycle_2-cycle_1;
printf("perf_calibrate: csr_read_cycle %lu\n", perf.csr_read_cycle);
// csr read inst cost
uint64_t inst_1 = csr_read(CSR_MINSTRET);
uint64_t inst_2 = csr_read(CSR_MINSTRET);
perf.csr_read_ninst = inst_2-inst_1;
printf("perf_calibrate: csr_read_ninst %lu\n", perf.csr_read_ninst);
#else
printf("running in simulation environment, hpm read disabled\n");
#endif
}
// Sink a computed value into a fixed memory location so the computation
// feeding it cannot be optimized away by the compiler.
void _perf_blackhole(uint64_t value)
{
*(uint64_t*) _PERF_BLACKHOLE = value;
}
// bit op utils for perf
#ifndef PROBE_BITUTILS_H
#define PROBE_BITUTILS_H
#include <klib.h>
// mask selecting bit positions [low, high)
extern uint64_t _perf_get_bit_mask(int low, int high);
// bits [low, high) of raw_data, right-aligned to bit 0
extern uint64_t _perf_get_bits(uint64_t raw_data, int low, int high);
// single bit of raw_data as 0 or 1; position in [0, 63]
extern uint64_t _perf_get_bit(uint64_t raw_data, int position);
// raw_data with the field [low, high) replaced by new_value
extern uint64_t _perf_set_bits(uint64_t raw_data, int low, int high, uint64_t new_value);
// raw_data with the bit at position set to (new_value & 1)
extern uint64_t _perf_set_bit(uint64_t raw_data, int position, int new_value);
#endif
\ No newline at end of file
......@@ -5,6 +5,11 @@
#include <klib.h>
#include <csr.h>
#include "bitutils.h"
#include "resultmat.h"
// config
// #define PERF_SIM // probe run in simulatior, diaable perf counters
// perf const
#define BYTE (1)
......@@ -13,15 +18,29 @@
#define GB (1024*MB)
// platform dependent const
// #define _PERF_TEST_ADDR_BASE 0x80400000
#define _PERF_TEST_ADDR_BASE 0x2000400000
#ifndef _PERF_TEST_ADDR_BASE
#define _PERF_TEST_ADDR_BASE 0x80400000
// #define _PERF_TEST_ADDR_BASE 0x2000400000
#endif
#define _PERF_CACHELINE_SIZE_BYTE (64 * BYTE)
#define _PERF_L1_NOALIAS_SIZE_BYTE (32 * KB)
#define _PERF_L1_SIZE_BYTE (128 * KB)
#define _PERF_L2_SIZE_BYTE (512 * KB)
#define _PERF_L3_SIZE_BYTE (2 * MB)
#define _PERF_L1_NUM_WAYS 8
#define _PERF_SET_SIZE_BYTE (_PERF_L1_SIZE_BYTE / _PERF_L1_NUM_WAYS)
#define _PERF_PAGE_SIZE_BYTE (4 * KB)
#define _PERF_L1_NOALIAS_SIZE_BYTE (16 * KB)
#define _PERF_L1_SIZE_BYTE (64 * KB)
#define _PERF_L2_SIZE_BYTE (1 * MB)
#define _PERF_L3_SIZE_BYTE (6 * MB)
#define _PERF_MEM_SIZE_BYTE (1024 * MB)
#define _PERF_L1_NUM_WAYS 4
#define _PERF_L1_NUM_SETS 256
#define _PERF_L2_NUM_WAYS 8
#define _PERF_L2_NUM_SLICES 4
#define _PERF_L2_NUM_SETS 512
#define _PERF_ADDR_STRIDE_L1_SAME_BANK _PERF_CACHELINE_SIZE_BYTE
#define _PERF_ADDR_STRIDE_L1_SAME_SET (_PERF_L1_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
#define _PERF_ADDR_STRIDE_L2_SAME_SLICE (_PERF_L2_NUM_SLICES * _PERF_CACHELINE_SIZE_BYTE)
#define _PERF_ADDR_STRIDE_L1_SAME_SET (_PERF_L1_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
#define _PERF_ADDR_STRIDE_L2_SAME_SET (_PERF_L2_NUM_SLICES * _PERF_L2_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
#define _PERF_ADDR_STRIDE_NEXT_PAGE (_PERF_PAGE_SIZE_BYTE)
// probe const
#define _PERF_BLACKHOLE _PERF_TEST_ADDR_BASE
......@@ -29,128 +48,51 @@
struct perf
{
// const to be calibrated at run time
uint64_t csr_read_cycle; //# of cycles to read mcycle
uint64_t csr_read_cycle; // # of cycles to read mcycle
uint64_t csr_read_ninst; // # of inst needed to read minstret
// timer
uint64_t cycle;
uint64_t instrcnt;
} perf;
void _perf_start_timer()
{
perf.cycle = csr_read(CSR_MCYCLE);
perf.instrcnt = csr_read(CSR_MINSTRET);
}
void _perf_end_timer()
{
perf.cycle = csr_read(CSR_MCYCLE) - perf.cycle;
perf.instrcnt = csr_read(CSR_MINSTRET) - perf.instrcnt;
}
void _perf_print_timer()
{
printf("cycle %d inst %d ipc %lf\n", perf.cycle, perf.instrcnt, (float)perf.instrcnt/perf.cycle);
}
void _perf_calibrate()
{
// csr read delay
uint64_t cycle_1 = csr_read(CSR_MCYCLE);
uint64_t cycle_2 = csr_read(CSR_MCYCLE);
perf.csr_read_cycle = cycle_2-cycle_1;
printf("perf_calibrate: csr_read_cycle %d\n", perf.csr_read_cycle);
// csr read inst cost
uint64_t inst_1 = csr_read(CSR_MINSTRET);
uint64_t inst_2 = csr_read(CSR_MINSTRET);
perf.csr_read_ninst = inst_2-inst_1;
printf("perf_calibrate: csr_read_ninst %d\n", perf.csr_read_ninst);
}
void _perf_blackhole(uint64_t value)
{
*(uint64_t*) _PERF_BLACKHOLE = value;
}
uint64_t setup_latency_test_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step)
{
uint64_t num_valid_node = 0;
assert(step % 8 == 0);
assert(step >= 8);
for (uint64_t cur_addr = base_addr; cur_addr < end_addr;) {
uint64_t next_addr = cur_addr + step;
*((uint64_t*)cur_addr) = next_addr;
cur_addr = next_addr;
num_valid_node++;
}
return num_valid_node;
}
uint64_t read_latency_test_linklist(uint64_t base_addr, uint64_t num_valid_node)
{
uint64_t cur_addr = base_addr;
for (int i = 0; i < num_valid_node; i++) {
cur_addr = (*(uint64_t*)cur_addr);
}
return cur_addr;
}
void warmup(uint64_t base_addr, uint64_t end_addr)
{
setup_latency_test_linklist(base_addr, end_addr, _PERF_CACHELINE_SIZE_BYTE);
}
void test_latency(uint64_t size, int iter)
{
volatile uint64_t result = 0; // make sure compiler will not opt read_latency_test_linklist
printf("range 0x%xB (%d iters) latency test\n", size, iter);
_perf_start_timer();
uint64_t nnode = setup_latency_test_linklist(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size, _PERF_CACHELINE_SIZE_BYTE);
_perf_end_timer();
uint64_t total_node = nnode * iter;
// _perf_print_timer();
_perf_start_timer();
for (int i = 0; i < iter; i++) {
result += read_latency_test_linklist(_PERF_TEST_ADDR_BASE, nnode);
}
_perf_end_timer();
// _perf_print_timer();
printf("range 0x%xB (%d intrs) read latency %f (%d samples)\n", size, iter, (float)perf.cycle / total_node, total_node);
_perf_blackhole(result);
}
void test_mem_throughput(uint64_t iter)
{
uint64_t remain = iter;
uint64_t result = 0;
uint64_t access_addr = _PERF_TEST_ADDR_BASE;
_perf_start_timer();
while (remain--) {
result += *(uint64_t*) access_addr;
access_addr += _PERF_CACHELINE_SIZE_BYTE;
}
_perf_end_timer();
*(uint64_t*) _PERF_BLACKHOLE = result;
printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter);
}
void test_mem_throughput_same_set(uint64_t iter)
{
uint64_t remain = iter;
uint64_t result = 0;
uint64_t access_addr = _PERF_TEST_ADDR_BASE;
_perf_start_timer();
while (remain--) {
result += *(uint64_t*) access_addr;
access_addr += _PERF_SET_SIZE_BYTE;
}
_perf_end_timer();
*(uint64_t*) _PERF_BLACKHOLE = result;
printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter);
}
};
extern struct perf perf;
extern uint64_t _perf_g_total_samples;
// common perf tools
extern void _perf_start_timer();
extern void _perf_end_timer();
extern void _perf_print_timer();
extern void _perf_calibrate();
extern void _perf_blackhole(uint64_t value);
// latency test
extern uint64_t setup_pointer_tracing_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step);
extern uint64_t read_pointer_tracing_linklist(uint64_t base_addr, uint64_t num_valid_node);
extern void latency_test_warmup(uint64_t base_addr, uint64_t end_addr);
extern float test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv);
extern float test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv);
extern float test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter, int to_csv);
extern float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv);
extern float test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv);
extern float test_same_address_load_latency(int iter, int to_csv);
extern float test_read_after_write_latency(int iter, int to_csv);
extern float test_linear_write_latency(uint64_t size, uint64_t step, int iter, int to_csv);
// bandwidth test
extern float test_l1_load_bandwidth(uint64_t size, int iter, int to_csv);
extern float test_l1_store_bandwidth(uint64_t size, int iter, int to_csv);
extern float test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv);
// key parameter matrix generate
void generate_linear_access_latency_matrix(uint64_t step);
void generate_pointer_tracing_latency_matrix(uint64_t step);
void generate_random_access_latency_matrix();
void generate_replacement_test_matrix();
// legacy test
extern void legacy_test_mem_throughput(uint64_t iter);
extern void legacy_test_mem_throughput_same_set(uint64_t iter);
#endif
\ No newline at end of file
#ifndef PROBE_RESULT_MATRIX_H
#define PROBE_RESULT_MATRIX_H
#include <klib.h>
// Metadata describing a 2-D result table to be printed by
// print_float_result_matrix(). result_array points at a float[row][column]
// array; row_array/column_array (optional, may be NULL) are int labels for
// each row/column.
struct result_matrix_meta {
char* name;
char* row_name;
char* column_name;
int row_size;
int column_size;
void* result_array;
void* column_array;
void* row_array;
};
void print_float_result_matrix(struct result_matrix_meta* meta);
void matrix_print_example();
#define FOR(v,end) for (int v = 0; v < end; v++)
#define CONCAT(a,b) a##b
#define TOSTR(a) #a
// Declare a float result matrix plus its meta struct and label arrays, and
// wire the meta fields up. The expansion mixes declarations and statements,
// so this macro must be used at block scope inside a function (see
// generate_replacement_test_matrix / matrix_print_example for usage).
#define DEFINE_FLOAT_RESULT_MATRIX(matrix_name, rowname, rowsize, columnname, columnsize) \
struct result_matrix_meta CONCAT(matrix_name,_matrix_meta); \
float CONCAT(matrix_name,_result_array)[rowsize][columnsize] = {0}; \
int CONCAT(matrix_name,_column_array)[columnsize] = {0}; \
int CONCAT(matrix_name,_row_array)[rowsize] = {0}; \
CONCAT(matrix_name,_matrix_meta).name = TOSTR(matrix_name); \
CONCAT(matrix_name,_matrix_meta).column_name = TOSTR(columnname); \
CONCAT(matrix_name,_matrix_meta).row_name = TOSTR(rowname); \
CONCAT(matrix_name,_matrix_meta).column_size = columnsize; \
CONCAT(matrix_name,_matrix_meta).row_size = rowsize; \
CONCAT(matrix_name,_matrix_meta).result_array = CONCAT(matrix_name,_result_array); \
CONCAT(matrix_name,_matrix_meta).column_array = CONCAT(matrix_name,_column_array); \
CONCAT(matrix_name,_matrix_meta).row_array = CONCAT(matrix_name,_row_array);
#endif
\ No newline at end of file
此差异已折叠。
#include <klib.h>
#include "maprobe.h"
// Regression set: linear (sequential / strided) load latency over ranges
// sized against the page, L1, and L2, with strides of a double word, a cache
// line, an L1-set-conflicting stride, an L2-slice stride, and a page.
// All results are printed; nothing is returned.
void typical_linear_load_test_set()
{
_perf_calibrate();
printf("------------- linear load test set -------------\n");
printf("page size linear double word load:\n");
test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 1, 0);
test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 2, 0);
printf("page size linear cache line load:\n");
test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
printf("dcache/2 linear double word load:\n");
test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, sizeof(uint64_t), 1, 0);
test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, sizeof(uint64_t), 2, 0);
printf("dcache/2 linear cache line load:\n");
test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
printf("dcache linear double word load:\n");
test_linear_access_latency(_PERF_L1_SIZE_BYTE, sizeof(uint64_t), 1, 0);
test_linear_access_latency(_PERF_L1_SIZE_BYTE, sizeof(uint64_t), 2, 0);
printf("dcache linear cache line load:\n");
test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
printf("L2 linear cache line load:\n");
test_linear_access_latency(_PERF_L2_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
test_linear_access_latency(_PERF_L2_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
// strides chosen to keep hitting the same L1 set (way-conflict pressure)
printf("L1 (L1 same set) linear cache line load:\n");
test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 10, 0);
test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 100, 0);
printf("L2 (L1 same set) linear cache line load:\n");
test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 2, 0);
test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 4, 0);
// strides chosen to keep hitting the same L2 slice
printf("L1 (L2 same slice) linear cache line load:\n");
test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 1, 0);
test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 2, 0);
printf("L2 (L2 same slice) linear cache line load:\n");
test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 1, 0);
test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 2, 0);
// page-sized stride: touches a new page every access
printf("L1 (page traverse) linear cache line load:\n");
test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 10, 0);
test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 100, 0);
printf("L2 (page traverse) linear cache line load:\n");
test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 2, 0);
test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 4, 0);
printf("total samples: %ld\n", _perf_g_total_samples);
}
// Regression set: random-access load latency from ranges sized against the
// page, L1, and L2, at word and cache-line alignment. Each configuration is
// run twice: once with pre-generated addresses (pregen_addr=1) and once
// generating addresses on the fly (pregen_addr=0).
// All results are printed; nothing is returned.
void typical_random_load_test_set()
{
printf("------------- random load test set -------------\n");
printf("from page size random load (word):\n");
test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, 8*BYTE, 1, 1, 0);
test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, 8*BYTE, 0, 1, 0);
printf("from page size random load (cache line):\n");
test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0);
test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0);
printf("from dcache/2 size random load (word):\n");
test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, 8*BYTE, 1, 1, 0);
test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, 8*BYTE, 0, 1, 0);
printf("from dcache/2 size random load (cache line):\n");
test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0);
test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0);
// access count scales with the number of cache lines in L1 (x2)
printf("from dcache size random load (word):\n");
test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, 8*BYTE, 1, 1, 0);
test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, 8*BYTE, 0, 1, 0);
printf("from dcache size random load (cache line):\n");
test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0);
test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0);
printf("from l2 size random load (word):\n");
test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, 8*BYTE, 1, 1, 0);
test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, 8*BYTE, 0, 1, 0);
printf("from l2 size random load (cache line):\n");
test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0);
test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0);
printf("total samples: %ld\n", _perf_g_total_samples);
}
// Regression set: pointer-chasing (dependent load) latency at double-word,
// cache-line, and page granularity, over ranges sized against page, L1, L2
// and L3. All results are printed; nothing is returned.
// Fix: corrected "dobule" typos and a stray comma in the output strings.
void typical_pointer_tracing_load_test_set()
{
printf("------------- pointer tracing load test set -------------\n");
printf("double word by double word tracing:\n");
test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, 8*BYTE, 10, 0);
test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, 8*BYTE, 2, 0);
test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, 8*BYTE, 2, 0);
test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, 8*BYTE, 2, 0);
printf("cacheline by cacheline tracing:\n");
test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 10, 0);
test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
printf("page by page tracing:\n");
test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE*2, _PERF_PAGE_SIZE_BYTE, 10, 0);
test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, _PERF_PAGE_SIZE_BYTE, 10, 0);
test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_PAGE_SIZE_BYTE, 10, 0);
test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_PAGE_SIZE_BYTE, 10, 0);
test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_PAGE_SIZE_BYTE, 10, 0);
printf("total samples: %ld\n", _perf_g_total_samples);
}
// Regression set: load/store dependency (memory disambiguation) latency —
// repeated loads of one address, then read-after-write to one address.
// All results are printed; nothing is returned.
// Fix: corrected "disambiuation" typo in the printed banner. The function
// name keeps the historical misspelling because callers (main) use it.
void typical_memory_disambiuation_test_set()
{
printf("------------- memory disambiguation test set -------------\n");
printf("load from the same address:\n");
test_same_address_load_latency(1024, 0);
test_same_address_load_latency(1024, 0);
test_same_address_load_latency(1024, 0);
printf("load then store to the same address:\n");
test_read_after_write_latency(1024, 0);
test_read_after_write_latency(1024, 0);
test_read_after_write_latency(1024, 0);
// more to be added
}
// Regression set: ideal L1 load/store/write-combine bandwidth plus the
// latency penalty of sweeping more same-set lines than the L1 has ways.
// All results are printed; nothing is returned.
void typical_l1_access_test_set()
{
printf("------------- typical dcache access pattern test set -------------\n");
printf("ideal load bandwidth:\n");
test_l1_load_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
test_l1_load_bandwidth(_PERF_L1_SIZE_BYTE, 10, 0);
printf("ideal store bandwidth:\n");
test_l1_store_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
test_l1_store_bandwidth(_PERF_L1_SIZE_BYTE, 10, 0);
printf("ideal write combine buffer bandwidth:\n");
test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, 5, 0);
// 32 same-set lines exceed the L1 way count, forcing replacements
printf("replacement error penalty:\n");
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
}
// typical latency test for fast regression
// Runs every "typical" test set in sequence; resets the global sample
// counter first so the per-set totals start from zero.
void typical_latency_test()
{
_perf_g_total_samples = 0;
typical_l1_access_test_set();
typical_linear_load_test_set();
typical_random_load_test_set();
typical_pointer_tracing_load_test_set();
typical_memory_disambiuation_test_set();
}
// Emit CSV data (range, latency, iters, samples) for a pointer-tracing
// latency curve: 1KB..64KB in 1KB steps, 64KB..1MB in 64KB steps, then
// 1MB..8MB in 1MB steps. Intended for plotting the cache-hierarchy graph.
void pointer_tracing_graph()
{
_perf_g_total_samples = 0;
_perf_calibrate();
printf("data for pointer tracing latency graph:\n");
printf("range (B), read latency, iters, samples\n");
for (int i = 1*KB; i < 64*KB; i = i + 1*KB) {
test_pointer_tracing_latency(i, _PERF_CACHELINE_SIZE_BYTE, 2, 1);
}
for (int i = 64*KB; i < 1024*KB; i = i + 64*KB) {
test_pointer_tracing_latency(i, _PERF_CACHELINE_SIZE_BYTE, 1, 1);
}
test_pointer_tracing_latency(1024*KB, _PERF_CACHELINE_SIZE_BYTE, 1, 1);
for (int i = 1*MB; i <8*MB; i = i + 1*MB) {
test_pointer_tracing_latency(i, _PERF_CACHELINE_SIZE_BYTE, 1, 1);
}
printf("total samples: %ld\n", _perf_g_total_samples);
}
// a simple test set used to check if test is working correctly
// Smoke test: runs one short instance of each probe (bandwidth, pointer
// tracing, linear, random, same-address, read-after-write) and prints the
// results so a broken probe can be spotted quickly.
void latency_test_example()
{
_perf_calibrate();
printf("latency test example:\n");
test_l1_load_bandwidth(4*KB, 5, 0);
test_l1_load_bandwidth(4*KB, 5, 0);
test_l1_store_bandwidth(4*KB, 5, 0);
test_l1_store_bandwidth(4*KB, 5, 0);
test_l1_store_wcb_bandwidth(8*KB, 5, 0);
test_l1_store_wcb_bandwidth(8*KB, 5, 0);
test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0);
test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0);
test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0);
test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0);
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*4,_PERF_ADDR_STRIDE_L1_SAME_SET,8,0);
test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0);
test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0);
test_same_address_load_latency(1024, 0);
test_read_after_write_latency(1024, 0);
printf("total samples: %ld\n", _perf_g_total_samples);
}
// Sweep an increasing number of L2-set-conflicting (then L1-set-conflicting)
// ways to find the latency jumps where each cache level runs out of ways.
// The "jump at i = ..." comments record observations on the reference design.
void l2_l3_pressure_test()
{
_perf_calibrate();
printf("L2 and L3 same set pressure test:\n");
for (int i = 1; i < 16; i++) {
printf("ways accessed: %d\n", i);
// each config is run twice: first pass warms up, second pass measures
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0);
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0);
}
for (int i = 16; i <= 512; i*=2) {
printf("ways accessed: %d\n", i);
// jump at i = 32
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0);
}
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*64,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*64,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
// jump at i = 128
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*128,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*128,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*256,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*256,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*512,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*512,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
}
// Legacy combined throughput + latency test, kept for comparison with the
// original maprobe. Prints one throughput figure and pointer-tracing
// latencies for ranges sized against L1, L2 and L3.
// Fix: corrected "sampl8es" typo in the final printf.
void legacy_latency_throughput_test()
{
_perf_calibrate();
printf("Memory throughput:\n");
legacy_test_mem_throughput(1024);
printf("L1 latency:\n");
test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0);
test_pointer_tracing_latency(_PERF_L1_NOALIAS_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
printf("L2 latency:\n");
test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
// test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
printf("L3 latency:\n");
test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
// test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE,2, 0);
// printf("MEM:\n");
// test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE*2, _PERF_CACHELINE_SIZE_BYTE,2, 0);
printf("total samples: %ld\n", _perf_g_total_samples);
}
// maprobe entry point: run the smoke test, the parameter matrices, the
// typical regression suite, and the L2/L3 pressure sweep. The commented-out
// calls are optional probes that can be toggled on when needed.
// Fix: removed the unreachable commented-out duplicate "return 0;".
int main()
{
latency_test_example();
generate_linear_access_latency_matrix(8*BYTE);
generate_linear_access_latency_matrix(_PERF_CACHELINE_SIZE_BYTE);
generate_pointer_tracing_latency_matrix(8*BYTE);
generate_pointer_tracing_latency_matrix(_PERF_CACHELINE_SIZE_BYTE);
// generate_random_access_latency_matrix();
generate_replacement_test_matrix();
// matrix_print_example();
typical_latency_test();
// pointer_tracing_graph();
// latency_test();
// legacy_latency_throughput_test();
l2_l3_pressure_test();
return 0;
}
\ No newline at end of file
#include <klib.h>
#include "maprobe.h"
// Legacy entry point (pre-rework maprobe): one throughput figure plus
// link-list latency tests over L1/L2/L3-sized ranges.
// NOTE(review): uses test_mem_throughput/test_latency, the older helper
// names — this appears to be the previous main.c, superseded by the new
// main that calls the test_* / legacy_test_* family. Confirm which main is
// built before editing behavior.
int main()
{
_perf_calibrate();
printf("Memory throughput:\n");
test_mem_throughput(512);
printf("L1 latency:\n");
test_latency(4 * KB, 5);
test_latency(_PERF_L1_NOALIAS_SIZE_BYTE, 2);
test_latency(_PERF_L1_SIZE_BYTE/2, 2);
test_latency(_PERF_L1_SIZE_BYTE, 2);
printf("L2 latency:\n");
test_latency(_PERF_L2_SIZE_BYTE/2, 2);
// test_latency(_PERF_L2_SIZE_BYTE, 2);
printf("L3 latency:\n");
test_latency(_PERF_L3_SIZE_BYTE/2, 2);
// test_latency(_PERF_L3_SIZE_BYTE,2);
// printf("MEM:\n");
// test_latency(_PERF_L3_SIZE_BYTE*2,2);
return 0;
}
\ No newline at end of file
#include "maprobe.h"
// Build and print the replacement-policy test matrix: for 1..17 same-set
// ways accessed, run one warmup pass (column 0) and 4 measured passes
// (columns 1..4) of same-L1-set linear accesses, recording latency per pass.
void generate_replacement_test_matrix()
{
#define REPLACEMENT_TEST_MAX_WAY 17 // up to 16 way
#define REPLACEMENT_TEST_ITER 5 // 1 warmup + 4 test
assert(REPLACEMENT_TEST_ITER >= 2);
// declares replacement_test_result_array / _row_array / _column_array
// and the populated replacement_test_matrix_meta (see resultmat.h)
DEFINE_FLOAT_RESULT_MATRIX(replacement_test,num_way_accessed,REPLACEMENT_TEST_MAX_WAY,iter,REPLACEMENT_TEST_ITER);
FOR(x,REPLACEMENT_TEST_ITER) { replacement_test_column_array[x] = x; }
for (int i = 0; i < REPLACEMENT_TEST_MAX_WAY; i++) {
replacement_test_row_array[i] = i+1;
int warm_up_iter = 64;
// fewer iterations for larger footprints to bound run time
int test_iter = i < 4 ? 256 : 64;
replacement_test_result_array[i][0] = test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*(i+1),_PERF_ADDR_STRIDE_L1_SAME_SET,warm_up_iter,0); //warmup
for(int j = 1; j < REPLACEMENT_TEST_ITER; j++) {
replacement_test_result_array[i][j] = test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*(i+1),_PERF_ADDR_STRIDE_L1_SAME_SET,test_iter,0); //test
}
}
print_float_result_matrix(&replacement_test_matrix_meta);
}
\ No newline at end of file
#include "resultmat.h"
// Pretty-print a float result matrix described by meta:
//   - a banner with the matrix name,
//   - an optional header row of column labels (if meta->column_array),
//   - one line per row: optional row label, then the float results.
// meta must be fully populated (see DEFINE_FLOAT_RESULT_MATRIX).
// Fix: the closing banner's "%s" had no matching argument (undefined
// behavior); pass meta->name like the opening banner does.
void print_float_result_matrix(struct result_matrix_meta* meta)
{
    assert(meta);
    printf("---------- %s matrix start ----------\n", meta->name);
    printf("%s (row) \\ %s (column)\n", meta->row_name, meta->column_name);
    if (meta->column_array) {
        if (meta->row_array) {
            // placeholder cell above the row-label column
            printf("\\ , \t");
        }
        for (int c = 0; c < meta->column_size; c++) {
            printf(" %d,\t", *((int*)meta->column_array + c));
        }
        printf("\n");
    }
    for (int r = 0; r < meta->row_size; r++) {
        if (meta->row_array) {
            printf("%3d,\t", *((int*)meta->row_array + r));
        }
        for (int c = 0; c < meta->column_size; c++) {
            printf("%f,\t", *((float*)meta->result_array + r * meta->column_size + c));
        }
        printf("\n");
    }
    printf("---------- %s matrix end ----------\n", meta->name);
}
// Self-test / usage example for the result-matrix helpers: defines a 5x10
// matrix via DEFINE_FLOAT_RESULT_MATRIX, fills it with random values, and
// prints it. The commented block shows what the macro expands to.
void matrix_print_example()
{
DEFINE_FLOAT_RESULT_MATRIX(test,testrow,5,testcol,10);
// ({
// struct result_matrix_meta test_matrix_meta;
// float test_result_array[5][10] = {0};
// int test_column_array[10] = {0};
// int testrow_array[5] = {0};
// test_matrix_meta.name = "test";
// test_matrix_meta.column_name = "testcol";
// test_matrix_meta.row_name = "testrow";
// test_matrix_meta.column_size = 10;
// test_matrix_meta.row_size = 5;
// test_matrix_meta.result_array = test_result_array;
// test_matrix_meta.column_array = test_column_array;
// test_matrix_meta.row_array = test_row_array;
// })
FOR(x,5) { test_row_array[x] = x; }
FOR(x,10) { test_column_array[x] = x; }
FOR(x,5) {
FOR(y,10) {
test_result_array[x][y] = rand();
}
}
print_float_result_matrix(&test_matrix_meta);
}
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册