May 27, 2026 18:09 · May 27, 2026 17:51 · May 27, 2026 17:29 · May 24, 2026 01:14 · May 24, 2026 01:09 · May 20, 2026 18:42
 diff --git a/Makefile b/Makefile
 index aff6ccd92c..0482f2aa9f 100644
 --- a/Makefile
 +++ b/Makefile
 @@ -118,7 +118,7 @@ dev-install: dev-install-requires dev-install-triton
 .NOPARALLEL: dev-install-llvm
 dev-install-llvm:
 	LLVM_BUILD_PATH=$(LLVM_BUILD_PATH) scripts/build-llvm-project.sh
 -	TRITON_BUILD_WITH_CLANG_LLD=1 TRITON_BUILD_WITH_CCACHE=0 \
 +	TRITON_BUILD_WITH_CLANG_LLD=0 TRITON_BUILD_WITH_CCACHE=0 \
 diff --git a/scripts/build-llvm-project.sh b/scripts/build-llvm-project.sh
 index e45356d5c3..795fed8149 100755
 --- a/scripts/build-llvm-project.sh
 +++ b/scripts/build-llvm-project.sh
 @@ -18,9 +18,9 @@ if [ -z "$CMAKE_ARGS" ]; then
               -DCMAKE_BUILD_TYPE="$LLVM_BUILD_TYPE"
               -DLLVM_CCACHE_BUILD=OFF
               -DLLVM_ENABLE_ASSERTIONS=ON
 -              -DCMAKE_C_COMPILER=clang
 -              -DCMAKE_CXX_COMPILER=clang++
 diff --git a/scripts/build-llvm-project.sh b/scripts/build-llvm-project.sh
 index e45356d5c3..dc65139895 100755
 --- a/scripts/build-llvm-project.sh
 +++ b/scripts/build-llvm-project.sh
 @@ -18,8 +18,8 @@ if [ -z "$CMAKE_ARGS" ]; then
               -DCMAKE_BUILD_TYPE="$LLVM_BUILD_TYPE"
               -DLLVM_CCACHE_BUILD=OFF
               -DLLVM_ENABLE_ASSERTIONS=ON
 -              -DCMAKE_C_COMPILER=clang
 -              -DCMAKE_CXX_COMPILER=clang++
 {
    "schemaVersion": 1,
    "deviceProperties": [
    {
      "id": 0, "name": "NVIDIA B200", "totalGlobalMem": 191503138816,
      "computeMajor": 10, "computeMinor": 0,
      "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048,
      "regsPerBlock": 65536, "warpSize": 32,
      "sharedMemPerBlock": 49152, "numSms": 148
    , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 232448, "sharedMemPerMultiprocessor": 233472
 {
    "schemaVersion": 1,
    "deviceProperties": [
    {
      "id": 0, "name": "NVIDIA B200", "totalGlobalMem": 191503138816,
      "computeMajor": 10, "computeMinor": 0,
      "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048,
      "regsPerBlock": 65536, "warpSize": 32,
      "sharedMemPerBlock": 49152, "numSms": 148
    , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 232448, "sharedMemPerMultiprocessor": 233472
 [
  {
    "models": [
      "BAAI/bge-base-en-v1.5",
      "BAAI/bge-large-en-v1.5",
      "BAAI/bge-small-en-v1.5",
      "FinLang/finance-embeddings-investopedia",
      "ProsusAI/finbert",
      "Xenova/all-MiniLM-L6-v2",
      "colbert-ir/colbertv2.0",
 diff --git a/fbcode/helion/helion/__init__.py b/fbcode/helion/helion/__init__.py
 --- a/fbcode/helion/helion/__init__.py
 +++ b/fbcode/helion/helion/__init__.py
 @@ -34,3 +34,6 @@
 from ._compiler._dynamo.variables import register_dynamo_variable  # noqa: E402
 
 register_dynamo_variable()
 +
 +import torch
 +torch.cuda.memory._record_memory_history(max_entries=100000)
 Inductor:
        L_x_user: "f32[4096, 759, 128][97152, 128, 1]cuda:0",
        L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_: "f32[192, 759][759, 1]cuda:0"
        l_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_
        l_x_user = L_x_user
        matmul: "bf16[4096, 192, 128][24576, 128, 1]cuda:0" = l_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ @ l_x_user;  l_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = None

 Helion
        l_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_
        L_self_modules_maybe_fuse
 --- /tmp/inductor.py	2026-04-30 10:22:24.247513427 -0700
 +++ /tmp/helion.py	2026-04-30 10:22:40.475653971 -0700
 @@ -1,8 +1,8 @@
 class GraphModule(torch.nn.Module):
 -    def forward(self, L_x_ads: "f32[4096, 523, 128][66944, 128, 1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_: "f32[192, 759][759, 1]cuda:0", L_x_user: "f32[4096, 759, 128][97152, 128, 1]cuda:0", L_x_ads_to_user_map: "i64[4096][1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_: "f32[96, 523][523, 1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_: "f32[32, 759][759, 1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_: "f32[16, 523][523, 1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_: "f32[192, 759][759, 1]cuda:0", L_sel
 class GraphModule(torch.nn.Module):
    def forward(self, L_input_embs_: "f32[4096, 523, 128][66944, 128, 1]cuda:0", L_self_modules_spec_projection_parameters_weight_: "f32[2048, 228][228, 1]cuda:0", L_self_modules_spec_projection_parameters_bias_: "f32[2048][1]cuda:0", L_raw_input_features_spec_embs_: "bf16[4096, 228][228, 1]cuda:0", L_self_modules_spec_projection_user_parameters_weight_: "f32[2048, 168][168, 1]cuda:0", L_self_modules_spec_projection_user_parameters_bias_: "f32[2048][1]cuda:0", L_raw_input_features_spec_embs_user_: "bf16[4096, 168][168, 1]cuda:0", L_raw_input_features_input_embs_user_: "f32[4096, 759, 128][97152, 128, 1]cuda:0", L_raw_input_features_dense_projection_: "f32[4096, 2048][2048, 1]cuda:0", L_raw_input_features_dense_projection_user_: "f32[4096, 2048][2048, 1]cuda:0", L_self_modules_wukong_modules_ln_modules_ln_user_parameters_weight_: "f32[128][1]cuda:0", L_self_modules_wukong_modules_ln_modules_ln_ads_parameters_weight_: "f32[128][1]cuda:0", L_self_modules_wukong_modules_ln2_mod
	diff --git a/Makefile b/Makefile
	index aff6ccd92c..0482f2aa9f 100644
	--- a/Makefile
	+++ b/Makefile
	@@ -118,7 +118,7 @@ dev-install: dev-install-requires dev-install-triton
	.NOPARALLEL: dev-install-llvm
	dev-install-llvm:
	LLVM_BUILD_PATH=$(LLVM_BUILD_PATH) scripts/build-llvm-project.sh
	- TRITON_BUILD_WITH_CLANG_LLD=1 TRITON_BUILD_WITH_CCACHE=0 \
	+ TRITON_BUILD_WITH_CLANG_LLD=0 TRITON_BUILD_WITH_CCACHE=0 \
	diff --git a/scripts/build-llvm-project.sh b/scripts/build-llvm-project.sh
	index e45356d5c3..795fed8149 100755
	--- a/scripts/build-llvm-project.sh
	+++ b/scripts/build-llvm-project.sh
	@@ -18,9 +18,9 @@ if [ -z "$CMAKE_ARGS" ]; then
	-DCMAKE_BUILD_TYPE="$LLVM_BUILD_TYPE"
	-DLLVM_CCACHE_BUILD=OFF
	-DLLVM_ENABLE_ASSERTIONS=ON
	- -DCMAKE_C_COMPILER=clang
	- -DCMAKE_CXX_COMPILER=clang++
	diff --git a/scripts/build-llvm-project.sh b/scripts/build-llvm-project.sh
	index e45356d5c3..dc65139895 100755
	--- a/scripts/build-llvm-project.sh
	+++ b/scripts/build-llvm-project.sh
	@@ -18,8 +18,8 @@ if [ -z "$CMAKE_ARGS" ]; then
	-DCMAKE_BUILD_TYPE="$LLVM_BUILD_TYPE"
	-DLLVM_CCACHE_BUILD=OFF
	-DLLVM_ENABLE_ASSERTIONS=ON
	- -DCMAKE_C_COMPILER=clang
	- -DCMAKE_CXX_COMPILER=clang++
	{
	"schemaVersion": 1,
	"deviceProperties": [
	{
	"id": 0, "name": "NVIDIA B200", "totalGlobalMem": 191503138816,
	"computeMajor": 10, "computeMinor": 0,
	"maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 2048,
	"regsPerBlock": 65536, "warpSize": 32,
	"sharedMemPerBlock": 49152, "numSms": 148
	, "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 232448, "sharedMemPerMultiprocessor": 233472
	[
	{
	"models": [
	"BAAI/bge-base-en-v1.5",
	"BAAI/bge-large-en-v1.5",
	"BAAI/bge-small-en-v1.5",
	"FinLang/finance-embeddings-investopedia",
	"ProsusAI/finbert",
	"Xenova/all-MiniLM-L6-v2",
	"colbert-ir/colbertv2.0",
	diff --git a/fbcode/helion/helion/__init__.py b/fbcode/helion/helion/__init__.py
	--- a/fbcode/helion/helion/__init__.py
	+++ b/fbcode/helion/helion/__init__.py
	@@ -34,3 +34,6 @@
	from ._compiler._dynamo.variables import register_dynamo_variable # noqa: E402

	register_dynamo_variable()
	+
	+import torch
	+torch.cuda.memory._record_memory_history(max_entries=100000)
	Inductor:
	L_x_user: "f32[4096, 759, 128][97152, 128, 1]cuda:0",
	L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_: "f32[192, 759][759, 1]cuda:0"
	l_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_
	l_x_user = L_x_user
	matmul: "bf16[4096, 192, 128][24576, 128, 1]cuda:0" = l_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ @ l_x_user; l_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = None

	Helion
	l_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_
	L_self_modules_maybe_fuse
	--- /tmp/inductor.py 2026-04-30 10:22:24.247513427 -0700
	+++ /tmp/helion.py 2026-04-30 10:22:40.475653971 -0700
	@@ -1,8 +1,8 @@
	class GraphModule(torch.nn.Module):
	- def forward(self, L_x_ads: "f32[4096, 523, 128][66944, 128, 1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_: "f32[192, 759][759, 1]cuda:0", L_x_user: "f32[4096, 759, 128][97152, 128, 1]cuda:0", L_x_ads_to_user_map: "i64[4096][1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_: "f32[96, 523][523, 1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_: "f32[32, 759][759, 1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_: "f32[16, 523][523, 1]cuda:0", L_self_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_: "f32[192, 759][759, 1]cuda:0", L_sel