1- /*
2- * Microbench tests for MLP and memory latency in CXLMS
3- *
4- * By: Andrew Quinn
5- * Yiwei Yang
6- *
7- * Copyright 2023 Regents of the University of California
8- * UC Santa Cruz Sluglab.
9- */
10-
11-
121#include < errno.h>
132#include < stdio.h>
143#include < assert.h>
187#include < cpuid.h>
198#include < pthread.h>
209#include < stdlib.h>
21-
2210#include < sys/mman.h>
2311
24-
2512#define STR_HELPER (x ) #x
2613#define STR (x ) STR_HELPER(x)
27-
2814#define MOVE_SIZE 128
2915#define MAP_SIZE (long )(1024 * 1024 * 1024 )
3016#define CACHELINE_SIZE 64
31-
3217#ifndef FENCE_COUNT
3318#define FENCE_COUNT 8
3419#endif
35-
3620#define FENCE_BOUND (FENCE_COUNT * MOVE_SIZE)
3721
38- // We need to advance in MOVE_SIZE increments; otherwise the loop segfaults!
39-
40- #define BODY (start ) \
41- " xor %%r8, %%r8 \n " \
42- " pxor %%xmm1, %%xmm1 \n " \
43- " LOOP_START%=: \n " \
44- " lea (%[" #start " ], %%r8), %%r9 \n " \
45- " movdqa %%xmm1, (%%r9) \n " \
46- " add $" STR(MOVE_SIZE) " , %%r8 \n " \
47- " cmp $" STR(FENCE_BOUND) " ,%%r8\n " \
48- " jl LOOP_START%= \n " \
49- " clflush (%%r9) \n " \
50- " mfence \n "
51-
22+ // Modified BODY macro with all fence instructions removed
23+ #define BODY (start ) \
24+ " xor %%r8, %%r8 \n " \
25+ " pxor %%xmm1, %%xmm1 \n " \
26+ " LOOP_START%=: \n " \
27+ " lea (%[" #start " ], %%r8), %%r9 \n " \
28+ " movdqa %%xmm1, (%%r9) \n " \
29+ " add $" STR(MOVE_SIZE) " , %%r8 \n " \
30+ " cmp $" STR(FENCE_BOUND) " ,%%r8\n " \
31+ " jl LOOP_START%= \n " \
32+ " mov $0, %%eax \n " \
33+ " cpuid \n " // use cpuid as a serialization point in place of a memory barrier
5234
5335int main (int argc, char **argv) {
54-
55- // in principle, you would want to clear out cache lines (and the
56- // pipeline) before doing any of the inline assembly stuff. But,
57- // that's hard. And, it's probably noise when you execute over
58- // enough things.
59-
60-
61- // allocate some memory
62- char *base =(char *) mmap (nullptr ,
63- MAP_SIZE,
64- PROT_READ | PROT_WRITE,
65- MAP_ANONYMOUS | MAP_PRIVATE,
66- -1 ,
67- 0 );
68-
36+ char *base = (char *) mmap (nullptr ,
37+ MAP_SIZE,
38+ PROT_READ | PROT_WRITE,
39+ MAP_ANONYMOUS | MAP_PRIVATE,
40+ -1 ,
41+ 0 );
6942 if (base == MAP_FAILED) {
7043 fprintf (stderr, " oops, you suck %d\n " , errno);
7144 return -1 ;
7245 }
73- char *addr = NULL ;
7446
47+ char *addr = NULL ;
7548 intptr_t *iaddr = (intptr_t *) base;
7649 intptr_t hash = 0 ;
7750 struct timespec tstart = {0 ,0 }, tend = {0 ,0 };
7851
79- // Necessary so that we don't include allocation costs in our benchmark
52+ // Fill the memory to make sure its pages are allocated
8053 while (iaddr < (intptr_t *)(base + MAP_SIZE)) {
8154 hash = hash ^ (intptr_t ) iaddr;
8255 *iaddr = hash;
8356 iaddr++;
8457 }
8558
86- // should flush everything from the cache. But, how big is the cache?
87- addr = base;
88- while (addr < (base + MAP_SIZE)) {
89- asm volatile (
90- " mov %[buf], %%rsi\n "
91- " clflush (%%rsi)\n "
92- :
93- : [buf] " r" (addr)
94- : " rsi" );
95- addr += CACHELINE_SIZE;
59+ // Alternative way to clear the cache: access a memory region larger than the cache
60+ size_t cache_clear_size = 32 * 1024 * 1024 ; // larger than a typical L3 cache
61+ char *cache_clear = (char *)malloc (cache_clear_size);
62+ if (cache_clear) {
63+ volatile char temp = 0 ;
64+ // Access the memory in a loop to evict the previous cache contents
65+ for (size_t i = 0 ; i < cache_clear_size; i += CACHELINE_SIZE) {
66+ cache_clear[i] = (char )i;
67+ temp += cache_clear[i]; // make sure the access is not optimized away
68+ }
69+ free (cache_clear);
9670 }
9771
98- asm volatile (" mfence\n " :::);
72+ // Use the cpuid instruction as a serialization point
73+ unsigned int eax, ebx, ecx, edx;
74+ __cpuid (0 , eax, ebx, ecx, edx);
9975
10076 clock_gettime (CLOCK_MONOTONIC, &tstart);
10177 addr = base;
78+
10279 while (addr < (base + MAP_SIZE)) {
103- // fprintf (stderr, "addr %p bound %p\n", addr, base + MAP_SIZE);
10480 asm volatile (
105- BODY (addr)
106- :
107- : [addr] " r" (addr)
108- : " r8" , " r9" , " xmm0" );
109-
110- addr += (FENCE_COUNT * MOVE_SIZE);
81+ BODY (addr)
82+ :
83+ : [addr] " r" (addr)
84+ : " rax" , " rbx" , " rcx" , " rdx" , " r8" , " r9" , " xmm0" , " xmm1" , " memory" );
85+ addr += (FENCE_COUNT * MOVE_SIZE);
11186 }
87+
11288 clock_gettime (CLOCK_MONOTONIC, &tend);
113- uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec );
89+ uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec );
11490 nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec );
115-
116-
11791 printf (" %lu\n " , nanos);
118- return 0 ;
119- }
12092
93+ munmap (base, MAP_SIZE);
94+ return 0 ;
95+ }
0 commit comments