Last active
May 26, 2026 19:31
-
-
Save Hermann-SW/9162a47fd59216aea77df90efc8e7ad2 to your computer and use it in GitHub Desktop.
Demonstrate multiple TOPS performance for Zen4 AMD CPUs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| f=AVX512_VNNI.DP2A_s32_s16_s16 | |
| g++ -O3 -fopenmp -Wall -Wextra -pedantic $f.cpp -o $f | |
| cpplint --filter=-legal/copyright $f.cpp | |
| cppcheck --enable=all --suppress=missingIncludeSystem $f.cpp --check-config | |
| echo off | sudo tee /sys/devices/system/cpu/smt/control | |
| echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid | |
| perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f | |
| Outputs for "./f": | |
| Performance: 2.23851 TOPS (Tera-Ops/sec) on AMD 8840HS 8C laptop CPU (at 4.400 GHz) | |
| Performance: 5.12171 TOPS (Tera-Ops/sec) on AMD 7950X 16C CPU (at 5.013 GHz) | |
| Outputs for "./f byte": | |
| Performance: 4.17551 TOPS (Tera-Ops/sec) on AMD 8840HS 8C laptop CPU (at 4.116 GHz) | |
| Performance: 10.2413 TOPS (Tera-Ops/sec) on AMD 7950X 16C CPU (at 5.013 GHz) | |
| */ | |
#include <omp.h>
#include <inttypes.h>
#include <chrono>  // NOLINT [build/c++11]
#include <iostream>
#include <locale>
#include <stdexcept>
// Measures AVX512-VNNI dot-product instruction throughput on all OpenMP
// threads and prints the result as TOPS (tera-operations per second).
//
// Every thread runs `iterations` rounds of five VNNI instructions, each one
// using its own zmm register as source and destination.  Without command-line
// arguments the 16-bit form (vpdpwssd) is executed; passing any argument
// selects the 8-bit form (vpdpbusd).
//
// Parameters:
//   argc - only tested for "> 1" to choose the 8-bit variant.
// Returns 0 after the report has been written to stdout.
int main(int argc, char**) {
  const int iterations = 2000000000;  // 2*10^9
  const bool byt = (argc > 1);
  std::cout << "Starting hardware-bound benchmark using "
            << omp_get_max_threads() << " threads...\n";

  // Team size actually granted by the OpenMP runtime.  It can be smaller than
  // omp_get_max_threads(), so it is read inside the parallel region and used
  // for the operation count below.
  int threads_used = 1;

  // steady_clock is monotonic; high_resolution_clock may alias the
  // adjustable system clock.
  const auto start_time = std::chrono::steady_clock::now();
  #pragma omp parallel
  {
    #pragma omp single
    threads_used = omp_get_num_threads();

    if (byt) {
      for (int i = 0; i < iterations; ++i) {
        asm __volatile__ ("vpdpbusd %%zmm0, %%zmm0, %%zmm0" : : : "zmm0");
        asm __volatile__ ("vpdpbusd %%zmm1, %%zmm1, %%zmm1" : : : "zmm1");
        asm __volatile__ ("vpdpbusd %%zmm2, %%zmm2, %%zmm2" : : : "zmm2");
        asm __volatile__ ("vpdpbusd %%zmm3, %%zmm3, %%zmm3" : : : "zmm3");
        asm __volatile__ ("vpdpbusd %%zmm4, %%zmm4, %%zmm4" : : : "zmm4");
      }
    } else {
      for (int i = 0; i < iterations; ++i) {
        asm __volatile__ ("vpdpwssd %%zmm0, %%zmm0, %%zmm0" : : : "zmm0");
        asm __volatile__ ("vpdpwssd %%zmm1, %%zmm1, %%zmm1" : : : "zmm1");
        asm __volatile__ ("vpdpwssd %%zmm2, %%zmm2, %%zmm2" : : : "zmm2");
        asm __volatile__ ("vpdpwssd %%zmm3, %%zmm3, %%zmm3" : : : "zmm3");
        asm __volatile__ ("vpdpwssd %%zmm4, %%zmm4, %%zmm4" : : : "zmm4");
      }
    }
  }
  // Stop the clock before any console output so that I/O is not part of the
  // measured interval.
  const std::chrono::duration<double> duration =
      std::chrono::steady_clock::now() - start_time;

  // Name the operation that actually ran: vpdpbusd multiplies unsigned bytes
  // with signed bytes, vpdpwssd multiplies signed words.
  std::cout << "... [AVX512_VNNI] "
            << (byt ? "DP4A(s32,u8,s8)" : "DP2A(s32,s16,s16)")
            << " completed\n";
  // The linked diagram illustrates the 16-bit (vpdpwssd) variant.
  std::cout <<
    "https://www.officedaytime.com/simd512e/simdimg/vpdpwssd_3.png?v=22\n";

  // Per instruction: 32 multiplications and 16+16 additions in word mode,
  // twice as many of each in byte mode.
  const int64_t width_factor = byt ? 2 : 1;
  const int64_t mults_per_instr = 32 * width_factor;
  const int64_t adds_per_instr = (16 + 16) * width_factor;
  const int64_t ops_per_instr = mults_per_instr + adds_per_instr;
  // Five instructions per loop round, on every participating thread.
  const int64_t ops_per_loop = 5 * ops_per_instr * threads_used;
  const int64_t total_ops = ops_per_loop * iterations;
  // Number of VNNI instructions executed over all threads; comparable with
  // the pack_512_uops_retired hardware counter.
  const int64_t giga_cnt = total_ops / ops_per_instr;
  const double tera_ops = total_ops / 1e12;
  const double performance = tera_ops / duration.count();

  std::cout << "-------------------------------------------\n";
  std::cout << "Execution Time: " << duration.count() << " seconds\n";
  // Use the user's locale for digit grouping from here on.  An unavailable
  // locale name in the environment makes std::locale("") throw; in that case
  // keep the current formatting instead of losing the results.
  try {
    std::cout.imbue(std::locale(""));
  } catch (const std::runtime_error&) {
    // Fall through with the stream's existing locale.
  }
  std::cout << "Counter: " << giga_cnt << "\n";
  std::cout << "Total Compute: " << tera_ops
            << " Tera-Operations (counter * (32+16+16) * "
            << width_factor << ")\n";
  std::cout << "Performance: " << performance << " TOPS (Tera-Ops/sec)\n";
  std::cout << "-------------------------------------------\n";
  return 0;
}
Author
Author
Verifying reported 5.12171 TOPS with fp_ops_retired_by_width.pack_512_uops_retired hardware counter.
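Nearly all of the fp_ops_retired_by_width.pack_512_uops_retired perf counter value is accounted for by the benchmark's own "Counter" output (160,000,000,000 of 160,000,038,102), which confirms the reported figure: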
160000000000 * 64 / (1.99933 * 10^12) = 5.12171577478455282519
hermann@7950x:~$ perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f
Starting hardware-bound benchmark using 16 threads...
... [AVX512_VNNI] DP2A(s32,s16,s16) completed
https://www.officedaytime.com/simd512e/simdimg/vpdpwssd_3.png?v=22
-------------------------------------------
Execution Time: 1.99933 seconds
Counter: 160,000,000,000
Total Compute: 10.24 Tera-Operations (counter * (32+16+16))
Performance: 5.12171 TOPS (Tera-Ops/sec)
-------------------------------------------
Performance counter stats for 'system wide':
160,000,038,102 fp_ops_retired_by_width.pack_512_uops_retired # 4.998 G/sec
160,489,568,548 cycles # 5.013 GHz
224,242,131,827 instructions # 1.40 insn per cycle
32,013.03 msec task-clock # 15.999 CPUs utilized
2.000884035 seconds time elapsed
hermann@7950x:~$
Author
Main loop in C++ with inline assembly (which gives control over the 512-bit register names):
hermann@8840hs:~$ sed -n "/pragma/,/^ }/p" $f.cpp
#pragma omp parallel
{
for (int i = 0; i < iterations; ++i) {
asm __volatile__ ("vpdpwssd %%zmm0, %%zmm0, %%zmm0" : : : "zmm0");
asm __volatile__ ("vpdpwssd %%zmm1, %%zmm1, %%zmm1" : : : "zmm1");
asm __volatile__ ("vpdpwssd %%zmm2, %%zmm2, %%zmm2" : : : "zmm2");
asm __volatile__ ("vpdpwssd %%zmm3, %%zmm3, %%zmm3" : : : "zmm3");
asm __volatile__ ("vpdpwssd %%zmm4, %%zmm4, %%zmm4" : : : "zmm4");
}
}
hermann@8840hs:~$
Same loop with objdump:
hermann@8840hs:~$ objdump -d AVX512_VNNI.DP2A_s32_s16_s16 | sed -n "/vpdpwssd/,/jne/p"
1580: 62 f2 7d 48 52 c0 vpdpwssd %zmm0,%zmm0,%zmm0
1586: 62 f2 75 48 52 c9 vpdpwssd %zmm1,%zmm1,%zmm1
158c: 62 f2 6d 48 52 d2 vpdpwssd %zmm2,%zmm2,%zmm2
1592: 62 f2 65 48 52 db vpdpwssd %zmm3,%zmm3,%zmm3
1598: 62 f2 5d 48 52 e4 vpdpwssd %zmm4,%zmm4,%zmm4
159e: 83 e8 01 sub $0x1,%eax
15a1: 75 dd jne 1580 <main._omp_fn.0+0x10>
hermann@8840hs:~$
Author
With the new "byte" option the TOPS numbers double (each instruction multiplies 64 byte pairs instead of 32 word pairs):
hermann@7950x:~$ perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f byte
Starting hardware-bound benchmark using 16 threads...
... [AVX512_VNNI] DP2A(s32,s16,s16) completed
https://www.officedaytime.com/simd512e/simdimg/vpdpwssd_3.png?v=22
-------------------------------------------
Execution Time: 1.99975 seconds
Counter: 160,000,000,000
Total Compute: 20.48 Tera-Operations (counter * (32+16+16) * 2)
Performance: 10.2413 TOPS (Tera-Ops/sec)
-------------------------------------------
Performance counter stats for 'system wide':
160,000,038,892 fp_ops_retired_by_width.pack_512_uops_retired # 4.997 G/sec
160,518,614,310 cycles # 5.013 GHz
224,246,479,090 instructions # 1.40 insn per cycle
32,018.54 msec task-clock # 16.000 CPUs utilized
2.001118241 seconds time elapsed
hermann@7950x:~$
hermann@8840hs:~$ perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f byte
Starting hardware-bound benchmark using 8 threads...
... [AVX512_VNNI] DP2A(s32,s16,s16) completed
https://www.officedaytime.com/simd512e/simdimg/vpdpwssd_3.png?v=22
-------------------------------------------
Execution Time: 2.4524 seconds
Counter: 80,000,000,000
Total Compute: 10.24 Tera-Operations (counter * (32+16+16) * 2)
Performance: 4.17551 TOPS (Tera-Ops/sec)
-------------------------------------------
Performance counter stats for 'system wide':
80,001,396,997 fp_ops_retired_by_width.pack_512_uops_retired # 4.073 G/sec
80,848,144,482 cycles # 4.116 GHz
112,521,408,812 instructions # 1.39 insn per cycle
19,641,966,642 task-clock # 7.999 CPUs utilized
2.455592230 seconds time elapsed
hermann@8840hs:~$
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Verifying reported 2.23851 TOPS with fp_ops_retired_by_width.pack_512_uops_retired hardware counter.
One vpdpwssd does 32 s16×s16 multiplications and 16+16 s32 additions:
https://www.officedaytime.com/simd512e/simdimg/vpdpwssd_3.png?v=22