Skip to content

Instantly share code, notes, and snippets.

diff --git a/Makefile b/Makefile
index aff6ccd92c..0482f2aa9f 100644
--- a/Makefile
+++ b/Makefile
@@ -118,7 +118,7 @@ dev-install: dev-install-requires dev-install-triton
.NOPARALLEL: dev-install-llvm
dev-install-llvm:
LLVM_BUILD_PATH=$(LLVM_BUILD_PATH) scripts/build-llvm-project.sh
- TRITON_BUILD_WITH_CLANG_LLD=1 TRITON_BUILD_WITH_CCACHE=0 \
+ TRITON_BUILD_WITH_CLANG_LLD=0 TRITON_BUILD_WITH_CCACHE=0 \
diff --git a/scripts/build-llvm-project.sh b/scripts/build-llvm-project.sh
index e45356d5c3..795fed8149 100755
--- a/scripts/build-llvm-project.sh
+++ b/scripts/build-llvm-project.sh
@@ -18,9 +18,9 @@ if [ -z "$CMAKE_ARGS" ]; then
-DCMAKE_BUILD_TYPE="$LLVM_BUILD_TYPE"
-DLLVM_CCACHE_BUILD=OFF
-DLLVM_ENABLE_ASSERTIONS=ON
- -DCMAKE_C_COMPILER=clang
- -DCMAKE_CXX_COMPILER=clang++
diff --git a/scripts/build-llvm-project.sh b/scripts/build-llvm-project.sh
index e45356d5c3..dc65139895 100755
--- a/scripts/build-llvm-project.sh
+++ b/scripts/build-llvm-project.sh
@@ -18,8 +18,8 @@ if [ -z "$CMAKE_ARGS" ]; then
-DCMAKE_BUILD_TYPE="$LLVM_BUILD_TYPE"
-DLLVM_CCACHE_BUILD=OFF
-DLLVM_ENABLE_ASSERTIONS=ON
- -DCMAKE_C_COMPILER=clang
- -DCMAKE_CXX_COMPILER=clang++
This file has been truncated, but you can view the full file.
{
"schemaVersion": 1,
"deviceProperties": [
{
"id": 0, "name": "NVIDIA B200", "totalGlobalMem": 191503138816,
"computeMajor": 10, "computeMinor": 0,
"maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048,
"regsPerBlock": 65536, "warpSize": 32,
"sharedMemPerBlock": 49152, "numSms": 148
, "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 232448, "sharedMemPerMultiprocessor": 233472
This file has been truncated, but you can view the full file.
{
"schemaVersion": 1,
"deviceProperties": [
{
"id": 0, "name": "NVIDIA B200", "totalGlobalMem": 191503138816,
"computeMajor": 10, "computeMinor": 0,
"maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048,
"regsPerBlock": 65536, "warpSize": 32,
"sharedMemPerBlock": 49152, "numSms": 148
, "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 232448, "sharedMemPerMultiprocessor": 233472
[
{
"models": [
"BAAI/bge-base-en-v1.5",
"BAAI/bge-large-en-v1.5",
"BAAI/bge-small-en-v1.5",
"FinLang/finance-embeddings-investopedia",
"ProsusAI/finbert",
"Xenova/all-MiniLM-L6-v2",
"colbert-ir/colbertv2.0",
diff --git a/fbcode/helion/helion/__init__.py b/fbcode/helion/helion/__init__.py
--- a/fbcode/helion/helion/__init__.py
+++ b/fbcode/helion/helion/__init__.py
@@ -34,3 +34,6 @@
from ._compiler._dynamo.variables import register_dynamo_variable # noqa: E402
register_dynamo_variable()
+
+import torch
+torch.cuda.memory._record_memory_history(max_entries=100000)
Inductor:
L_x_user: "f32[4096, 759, 128][97152, 128, 1]cuda:0",
L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_: "f32[192, 759][759, 1]cuda:0"
l_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_
l_x_user = L_x_user
matmul: "bf16[4096, 192, 128][24576, 128, 1]cuda:0" = l_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ @ l_x_user; l_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = None
Helion
l_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_
L_self_modules_maybe_fuse
--- /tmp/inductor.py 2026-04-30 10:22:24.247513427 -0700
+++ /tmp/helion.py 2026-04-30 10:22:40.475653971 -0700
@@ -1,8 +1,8 @@
class GraphModule(torch.nn.Module):
- def forward(self, L_x_ads: "f32[4096, 523, 128][66944, 128, 1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_: "f32[192, 759][759, 1]cuda:0", L_x_user: "f32[4096, 759, 128][97152, 128, 1]cuda:0", L_x_ads_to_user_map: "i64[4096][1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_: "f32[96, 523][523, 1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_: "f32[32, 759][759, 1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_: "f32[16, 523][523, 1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_: "f32[192, 759][759, 1]cuda:0", L_sel
class GraphModule(torch.nn.Module):
def forward(self, L_input_embs_: "f32[4096, 523, 128][66944, 128, 1]cuda:0", L_self_modules_spec_projection_parameters_weight_: "f32[2048, 228][228, 1]cuda:0", L_self_modules_spec_projection_parameters_bias_: "f32[2048][1]cuda:0", L_raw_input_features_spec_embs_: "bf16[4096, 228][228, 1]cuda:0", L_self_modules_spec_projection_user_parameters_weight_: "f32[2048, 168][168, 1]cuda:0", L_self_modules_spec_projection_user_parameters_bias_: "f32[2048][1]cuda:0", L_raw_input_features_spec_embs_user_: "bf16[4096, 168][168, 1]cuda:0", L_raw_input_features_input_embs_user_: "f32[4096, 759, 128][97152, 128, 1]cuda:0", L_raw_input_features_dense_projection_: "f32[4096, 2048][2048, 1]cuda:0", L_raw_input_features_dense_projection_user_: "f32[4096, 2048][2048, 1]cuda:0", L_self_modules_wukong_modules_ln_modules_ln_user_parameters_weight_: "f32[128][1]cuda:0", L_self_modules_wukong_modules_ln_modules_ln_ads_parameters_weight_: "f32[128][1]cuda:0", L_self_modules_wukong_modules_ln2_mod