Created April 24, 2026 23:15
-
-
Save shunting314/db1bc29def88b138e7500df1f9357b3e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| class GraphModule(torch.nn.Module): | |
| def forward(self, L_input_embs_: "f32[4096, 523, 128][66944, 128, 1]cuda:0", L_self_modules_spec_projection_parameters_weight_: "f32[2048, 228][228, 1]cuda:0", L_self_modules_spec_projection_parameters_bias_: "f32[2048][1]cuda:0", L_raw_input_features_spec_embs_: "bf16[4096, 228][228, 1]cuda:0", L_self_modules_spec_projection_user_parameters_weight_: "f32[2048, 168][168, 1]cuda:0", L_self_modules_spec_projection_user_parameters_bias_: "f32[2048][1]cuda:0", L_raw_input_features_spec_embs_user_: "bf16[4096, 168][168, 1]cuda:0", L_raw_input_features_input_embs_user_: "f32[4096, 759, 128][97152, 128, 1]cuda:0", L_raw_input_features_dense_projection_: "f32[4096, 2048][2048, 1]cuda:0", L_raw_input_features_dense_projection_user_: "f32[4096, 2048][2048, 1]cuda:0", L_self_modules_wukong_modules_ln_modules_ln_user_parameters_weight_: "f32[128][1]cuda:0", L_self_modules_wukong_modules_ln_modules_ln_ads_parameters_weight_: "f32[128][1]cuda:0", L_self_modules_wukong_modules_ln2_modules_ln_user_parameters_weight_: "f32[4096][1]cuda:0", L_self_modules_wukong_modules_ln2_modules_ln_ads_parameters_weight_: "f32[4096][1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_: "f32[192, 759][759, 1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_: "f32[96, 523][523, 1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_: "f32[32, 759][759, 1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_: "f32[16, 523][523, 1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_: "f32[192, 
759][759, 1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_ads_parameters_weight_: "f32[96, 523][523, 1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_: "f32[512, 2048][2048, 1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_: "f32[256, 2048][2048, 1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_: "f32[256][1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_: "f32[256][1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_: "f32[512, 256][256, 1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_: "f32[256, 256][256, 1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_: "f32[256][1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_: "f32[256][1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_: "f32[41024, 256][256, 1]cuda:0", L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_: "f32[20512, 256][256, 1]cuda:0", 
L_self_modules_wukong_modules_layers_modules_0_modules_ln1_modules_ln_user_parameters_weight_: "f32[16240][1]cuda:0", L_self_mo | |
| l_input_embs_ = L_input_embs_ | |
| l_self_modules_spec_projection_parameters_weight_ = L_self_modules_spec_projection_parameters_weight_ | |
| l_self_modules_spec_projection_parameters_bias_ = L_self_modules_spec_projection_parameters_bias_ | |
| l_raw_input_features_spec_embs_ = L_raw_input_features_spec_embs_ | |
| l_self_modules_spec_projection_user_parameters_weight_ = L_self_modules_spec_projection_user_parameters_weight_ | |
| l_self_modules_spec_projection_user_parameters_bias_ = L_self_modules_spec_projection_user_parameters_bias_ | |
| l_raw_input_features_spec_embs_user_ = L_raw_input_features_spec_embs_user_ | |
| l_raw_input_features_input_embs_user_ = L_raw_input_features_input_embs_user_ | |
| l_raw_input_features_dense_projection_ = L_raw_input_features_dense_projection_ | |
| l_raw_input_features_dense_projection_user_ = L_raw_input_features_dense_projection_user_ | |
| l_self_modules_wukong_modules_ln_modules_ln_user_parameters_weight_ = L_self_modules_wukong_modules_ln_modules_ln_user_parameters_weight_ | |
| l_self_modules_wukong_modules_ln_modules_ln_ads_parameters_weight_ = L_self_modules_wukong_modules_ln_modules_ln_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_ln2_modules_ln_user_parameters_weight_ = L_self_modules_wukong_modules_ln2_modules_ln_user_parameters_weight_ | |
| l_self_modules_wukong_modules_ln2_modules_ln_ads_parameters_weight_ = L_self_modules_wukong_modules_ln2_modules_ln_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_ln1_modules_ln_user_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_ln1_modules_ln_user_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_ln1_modules_ln_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_ln1_modules_ln_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_output_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_output_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_output_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_output_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_post_snn_lce_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_post_snn_lce_modules_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_post_snn_lce_modules_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_post_snn_lce_modules_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_ln2_modules_ln_user_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_ln2_modules_ln_user_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_0_modules_ln2_modules_ln_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_0_modules_ln2_modules_ln_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_ln1_modules_ln_user_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_ln1_modules_ln_user_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_ln1_modules_ln_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_ln1_modules_ln_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_output_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_output_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_output_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_output_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_post_snn_lce_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_post_snn_lce_modules_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_post_snn_lce_modules_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_post_snn_lce_modules_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_ln2_modules_ln_user_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_ln2_modules_ln_user_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_1_modules_ln2_modules_ln_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_1_modules_ln2_modules_ln_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_ln1_modules_ln_user_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_ln1_modules_ln_user_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_ln1_modules_ln_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_ln1_modules_ln_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_output_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_output_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_output_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_output_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_post_snn_lce_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_post_snn_lce_modules_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_post_snn_lce_modules_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_post_snn_lce_modules_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_ln2_modules_ln_user_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_ln2_modules_ln_user_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_2_modules_ln2_modules_ln_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_2_modules_ln2_modules_ln_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_ln1_modules_ln_user_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_ln1_modules_ln_user_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_ln1_modules_ln_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_ln1_modules_ln_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_ | |
| l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = L_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:614 in forward, code: input_embs.new_ones(input_embs.size(0), dtype=torch.int32).cumsum(0) - 1 | |
| new_ones: "i32[4096][1]cuda:0" = l_input_embs_.new_ones(4096, dtype = torch.int32) | |
| cumsum: "i64[4096][1]cuda:0" = new_ones.cumsum(0); new_ones = None | |
| dummy_ads_to_user_map: "i64[4096][1]cuda:0" = cumsum - 1; cumsum = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:635 in forward, code: spec_proj = [self.spec_projection(spec_embs)] if self.spec_projection else [] | |
| linear: "bf16[4096, 2048][2048, 1]cuda:0" = torch._C._nn.linear(l_raw_input_features_spec_embs_, l_self_modules_spec_projection_parameters_weight_, l_self_modules_spec_projection_parameters_bias_); l_raw_input_features_spec_embs_ = l_self_modules_spec_projection_parameters_weight_ = l_self_modules_spec_projection_parameters_bias_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:637 in forward, code: [self.spec_projection_user(spec_embs_user)] | |
| linear_1: "bf16[4096, 2048][2048, 1]cuda:0" = torch._C._nn.linear(l_raw_input_features_spec_embs_user_, l_self_modules_spec_projection_user_parameters_weight_, l_self_modules_spec_projection_user_parameters_bias_); l_raw_input_features_spec_embs_user_ = l_self_modules_spec_projection_user_parameters_weight_ = l_self_modules_spec_projection_user_parameters_bias_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:655 in forward, code: torch.cat([dense_projection_user] + spec_proj_user, dim=1), | |
| cat: "f32[4096, 4096][4096, 1]cuda:0" = torch.cat([l_raw_input_features_dense_projection_user_, linear_1], dim = 1); l_raw_input_features_dense_projection_user_ = linear_1 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:656 in forward, code: torch.cat([dense_projection] + spec_proj, dim=1), | |
| cat_1: "f32[4096, 4096][4096, 1]cuda:0" = torch.cat([l_raw_input_features_dense_projection_, linear], dim = 1); l_raw_input_features_dense_projection_ = linear = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| res_user: "f32[4096, 759, 128][97152, 128, 1]cuda:0" = torch.rms_norm(l_raw_input_features_input_embs_user_, (128,), l_self_modules_wukong_modules_ln_modules_ln_user_parameters_weight_, None); l_raw_input_features_input_embs_user_ = l_self_modules_wukong_modules_ln_modules_ln_user_parameters_weight_ = None | |
| res_ads: "f32[4096, 523, 128][66944, 128, 1]cuda:0" = torch.rms_norm(l_input_embs_, (128,), l_self_modules_wukong_modules_ln_modules_ln_ads_parameters_weight_, None); l_input_embs_ = l_self_modules_wukong_modules_ln_modules_ln_ads_parameters_weight_ = None | |
| res_user_1: "f32[4096, 4096][4096, 1]cuda:0" = torch.rms_norm(cat, (4096,), l_self_modules_wukong_modules_ln2_modules_ln_user_parameters_weight_, None); cat = l_self_modules_wukong_modules_ln2_modules_ln_user_parameters_weight_ = None | |
| res_ads_1: "f32[4096, 4096][4096, 1]cuda:0" = torch.rms_norm(cat_1, (4096,), l_self_modules_wukong_modules_ln2_modules_ln_ads_parameters_weight_, None); cat_1 = l_self_modules_wukong_modules_ln2_modules_ln_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:488 in <lambda>, code: lambda: self.linear_shared.weight @ x_user | |
| matmul: "bf16[4096, 192, 128][24576, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ @ res_user; l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:490 in forward_lce, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_2: "bf16[4096, 96, 128][24576, 128, 1]cuda:0" = matmul[(slice(None, None, None), slice(None, 96, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| matmul_1: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_ @ res_ads; l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:496 in <lambda>, code: res_shared[:, self.num_out_user :], ads_to_user_map | |
| getitem_1: "bf16[4096, 96, 128][24576, 128, 1]cuda:0" = matmul[(slice(None, None, None), slice(96, None, None))]; matmul = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = torch.index_select(getitem_1, dim = 0, index = dummy_ads_to_user_map); getitem_1 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| res_ads_2: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = matmul_1 + index_select; matmul_1 = index_select = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:488 in <lambda>, code: lambda: self.linear_shared.weight @ x_user | |
| matmul_2: "bf16[4096, 32, 128][4096, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ @ res_user; l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:490 in forward_lce, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_3: "bf16[4096, 16, 128][4096, 128, 1]cuda:0" = matmul_2[(slice(None, None, None), slice(None, 16, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| matmul_3: "bf16[4096, 16, 128][2048, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ @ res_ads; l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:496 in <lambda>, code: res_shared[:, self.num_out_user :], ads_to_user_map | |
| getitem_3: "bf16[4096, 16, 128][4096, 128, 1]cuda:0" = matmul_2[(slice(None, None, None), slice(16, None, None))]; matmul_2 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_1: "bf16[4096, 16, 128][2048, 128, 1]cuda:0" = torch.index_select(getitem_3, dim = 0, index = dummy_ads_to_user_map); getitem_3 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| res_ads_3: "bf16[4096, 16, 128][2048, 128, 1]cuda:0" = matmul_3 + index_select_1; matmul_3 = index_select_1 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:488 in <lambda>, code: lambda: self.linear_shared.weight @ x_user | |
| matmul_4: "bf16[4096, 192, 128][24576, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_ @ res_user; l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:490 in forward_lce, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_4: "bf16[4096, 96, 128][24576, 128, 1]cuda:0" = matmul_4[(slice(None, None, None), slice(None, 96, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| matmul_5: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_ads_parameters_weight_ @ res_ads; l_self_modules_wukong_modules_layers_modules_0_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:496 in <lambda>, code: res_shared[:, self.num_out_user :], ads_to_user_map | |
| getitem_5: "bf16[4096, 96, 128][24576, 128, 1]cuda:0" = matmul_4[(slice(None, None, None), slice(96, None, None))]; matmul_4 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_2: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = torch.index_select(getitem_5, dim = 0, index = dummy_ads_to_user_map); getitem_5 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| res_ads_4: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = matmul_5 + index_select_2; matmul_5 = index_select_2 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:189 in view_3d_to_2d, code: self.user.view(self.user.shape[0], emb_dim), | |
| view: "bf16[4096, 2048][4096, 1]cuda:0" = res_user_3.view(4096, -1); res_user_3 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:190 in view_3d_to_2d, code: self.ads.view(self.ads.shape[0], emb_dim), | |
| view_1: "bf16[4096, 2048][2048, 1]cuda:0" = res_ads_3.view(4096, -1); res_ads_3 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_2: "bf16[4096, 512][512, 1]cuda:0" = torch._C._nn.linear(view, l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_, None); view = l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_5: "bf16[4096, 256][512, 1]cuda:0" = linear_2[(slice(None, None, None), slice(None, 256, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_3: "bf16[4096, 256][256, 1]cuda:0" = torch._C._nn.linear(view_1, l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_, None); view_1 = l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_7: "bf16[4096, 256][512, 1]cuda:0" = linear_2[(slice(None, None, None), slice(256, None, None))]; linear_2 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_3: "bf16[4096, 256][256, 1]cuda:0" = torch.index_select(getitem_7, dim = 0, index = dummy_ads_to_user_map); getitem_7 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_5: "bf16[4096, 256][256, 1]cuda:0" = linear_3 + index_select_3; linear_3 = index_select_3 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output: "f32[4096, 256][256, 1]cuda:0" = torch.rms_norm(res_user_5, (256,), l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_5 = l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_1: "f32[4096, 256][256, 1]cuda:0" = torch.rms_norm(res_ads_5, (256,), l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_5 = l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid: "f32[4096, 256][256, 1]cuda:0" = torch.sigmoid(output) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul: "f32[4096, 256][256, 1]cuda:0" = output * sigmoid; output = sigmoid = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_1: "f32[4096, 256][256, 1]cuda:0" = torch.sigmoid(output_1) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_1: "f32[4096, 256][256, 1]cuda:0" = output_1 * sigmoid_1; output_1 = sigmoid_1 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_4: "bf16[4096, 512][512, 1]cuda:0" = torch._C._nn.linear(mul, l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_, None); mul = l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_6: "bf16[4096, 256][512, 1]cuda:0" = linear_4[(slice(None, None, None), slice(None, 256, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_5: "bf16[4096, 256][256, 1]cuda:0" = torch._C._nn.linear(mul_1, l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_, None); mul_1 = l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_9: "bf16[4096, 256][512, 1]cuda:0" = linear_4[(slice(None, None, None), slice(256, None, None))]; linear_4 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_4: "bf16[4096, 256][256, 1]cuda:0" = torch.index_select(getitem_9, dim = 0, index = dummy_ads_to_user_map); getitem_9 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_6: "bf16[4096, 256][256, 1]cuda:0" = linear_5 + index_select_4; linear_5 = index_select_4 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_2: "f32[4096, 256][256, 1]cuda:0" = torch.rms_norm(res_user_6, (256,), l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_6 = l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_3: "f32[4096, 256][256, 1]cuda:0" = torch.rms_norm(res_ads_6, (256,), l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_6 = l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_2: "f32[4096, 256][256, 1]cuda:0" = torch.sigmoid(output_2) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_2: "f32[4096, 256][256, 1]cuda:0" = output_2 * sigmoid_2; output_2 = sigmoid_2 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_3: "f32[4096, 256][256, 1]cuda:0" = torch.sigmoid(output_3) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_3: "f32[4096, 256][256, 1]cuda:0" = output_3 * sigmoid_3; output_3 = sigmoid_3 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_6: "bf16[4096, 41024][41024, 1]cuda:0" = torch._C._nn.linear(mul_2, l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_, None); mul_2 = l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_7: "bf16[4096, 20512][41024, 1]cuda:0" = linear_6[(slice(None, None, None), slice(None, 20512, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_7: "bf16[4096, 20512][20512, 1]cuda:0" = torch._C._nn.linear(mul_3, l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_, None); mul_3 = l_self_modules_wukong_modules_layers_modules_0_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_11: "bf16[4096, 20512][41024, 1]cuda:0" = linear_6[(slice(None, None, None), slice(20512, None, None))]; linear_6 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_5: "bf16[4096, 20512][20512, 1]cuda:0" = torch.index_select(getitem_11, dim = 0, index = dummy_ads_to_user_map); getitem_11 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_7: "bf16[4096, 20512][20512, 1]cuda:0" = linear_7 + index_select_5; linear_7 = index_select_5 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:181 in view_2d_to_3d, code: self.user.view(self.user.shape[0], num_emb, emb_dim), | |
| view_2: "bf16[4096, 16, 1282][41024, 1282, 1]cuda:0" = res_user_7.view(4096, -1, 1282); res_user_7 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:182 in view_2d_to_3d, code: self.ads.view(self.ads.shape[0], num_emb, emb_dim), | |
| view_3: "bf16[4096, 16, 1282][20512, 1282, 1]cuda:0" = res_ads_7.view(4096, -1, 1282); res_ads_7 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_6: "f32[4096, 759, 128][97152, 128, 1]cuda:0" = torch.index_select(res_user, dim = 0, index = dummy_ads_to_user_map) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:980 in forward, code: x_all = torch.cat( | |
| cat_2: "f32[4096, 1282, 128][164096, 128, 1]cuda:0" = torch.cat([index_select_6, res_ads], dim = 1); index_select_6 = res_ads = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:989 in <lambda>, code: lambda: v_ads @ x_all | |
| matmul_6: "bf16[4096, 16, 128][2048, 128, 1]cuda:0" = view_3 @ cat_2; view_3 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:993 in <lambda>, code: lambda: v_user[:, :, :num_emb_user] @ x.user | |
| getitem_12: "bf16[4096, 16, 759][41024, 1282, 1]cuda:0" = view_2[(slice(None, None, None), slice(None, None, None), slice(None, 759, None))]; view_2 = None | |
| matmul_7: "bf16[4096, 16, 128][2048, 128, 1]cuda:0" = getitem_12 @ res_user; getitem_12 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:1006 in <lambda>, code: lambda: vx_ads @ x_all.permute(0, 2, 1) | |
| permute: "f32[4096, 128, 1282][164096, 1, 128]cuda:0" = cat_2.permute(0, 2, 1); cat_2 = None | |
| res_ads_8: "bf16[4096, 16, 1282][20512, 1282, 1]cuda:0" = matmul_6 @ permute; matmul_6 = permute = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:1010 in <lambda>, code: lambda: vx_user @ x.user.permute(0, 2, 1) | |
| permute_1: "f32[4096, 128, 759][97152, 1, 128]cuda:0" = res_user.permute(0, 2, 1); res_user = None | |
| res_user_8: "bf16[4096, 16, 759][12144, 759, 1]cuda:0" = matmul_7 @ permute_1; matmul_7 = permute_1 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:189 in view_3d_to_2d, code: self.user.view(self.user.shape[0], emb_dim), | |
| view_4: "bf16[4096, 12144][12144, 1]cuda:0" = res_user_8.view(4096, -1); res_user_8 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:190 in view_3d_to_2d, code: self.ads.view(self.ads.shape[0], emb_dim), | |
| view_5: "bf16[4096, 20512][20512, 1]cuda:0" = res_ads_8.view(4096, -1); res_ads_8 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:208 in cat, code: torch.cat([t.user for t in tensors], dim=1), | |
| cat_3: "f32[4096, 16240][16240, 1]cuda:0" = torch.cat([view_4, res_user_1], dim = 1); view_4 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:209 in cat, code: torch.cat([t.ads for t in tensors], dim=1), | |
| cat_4: "f32[4096, 24608][24608, 1]cuda:0" = torch.cat([view_5, res_ads_1], dim = 1); view_5 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| res_user_9: "f32[4096, 16240][16240, 1]cuda:0" = torch.rms_norm(cat_3, (16240,), l_self_modules_wukong_modules_layers_modules_0_modules_ln1_modules_ln_user_parameters_weight_, None); cat_3 = l_self_modules_wukong_modules_layers_modules_0_modules_ln1_modules_ln_user_parameters_weight_ = None | |
| res_ads_9: "f32[4096, 24608][24608, 1]cuda:0" = torch.rms_norm(cat_4, (24608,), l_self_modules_wukong_modules_layers_modules_0_modules_ln1_modules_ln_ads_parameters_weight_, None); cat_4 = l_self_modules_wukong_modules_layers_modules_0_modules_ln1_modules_ln_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_8: "bf16[4096, 9216][9216, 1]cuda:0" = torch._C._nn.linear(res_user_9, l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_, None); res_user_9 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_10: "bf16[4096, 4608][9216, 1]cuda:0" = linear_8[(slice(None, None, None), slice(None, 4608, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_9: "bf16[4096, 4608][4608, 1]cuda:0" = torch._C._nn.linear(res_ads_9, l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_, None); res_ads_9 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_14: "bf16[4096, 4608][9216, 1]cuda:0" = linear_8[(slice(None, None, None), slice(4608, None, None))]; linear_8 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_7: "bf16[4096, 4608][4608, 1]cuda:0" = torch.index_select(getitem_14, dim = 0, index = dummy_ads_to_user_map); getitem_14 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_10: "bf16[4096, 4608][4608, 1]cuda:0" = linear_9 + index_select_7; linear_9 = index_select_7 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_4: "f32[4096, 4608][4608, 1]cuda:0" = torch.rms_norm(res_user_10, (4608,), l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_10 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_5: "f32[4096, 4608][4608, 1]cuda:0" = torch.rms_norm(res_ads_10, (4608,), l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_10 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_4: "f32[4096, 4608][4608, 1]cuda:0" = torch.sigmoid(output_4) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_4: "f32[4096, 4608][4608, 1]cuda:0" = output_4 * sigmoid_4; output_4 = sigmoid_4 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_5: "f32[4096, 4608][4608, 1]cuda:0" = torch.sigmoid(output_5) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_5: "f32[4096, 4608][4608, 1]cuda:0" = output_5 * sigmoid_5; output_5 = sigmoid_5 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_10: "bf16[4096, 4608][4608, 1]cuda:0" = torch._C._nn.linear(mul_4, l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_11: "bf16[4096, 2304][4608, 1]cuda:0" = linear_10[(slice(None, None, None), slice(None, 2304, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_11: "bf16[4096, 2304][2304, 1]cuda:0" = torch._C._nn.linear(mul_5, l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_16: "bf16[4096, 2304][4608, 1]cuda:0" = linear_10[(slice(None, None, None), slice(2304, None, None))]; linear_10 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_8: "bf16[4096, 2304][2304, 1]cuda:0" = torch.index_select(getitem_16, dim = 0, index = dummy_ads_to_user_map); getitem_16 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_11: "bf16[4096, 2304][2304, 1]cuda:0" = linear_11 + index_select_8; linear_11 = index_select_8 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_6: "f32[4096, 2304][2304, 1]cuda:0" = torch.rms_norm(res_user_11, (2304,), l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_11 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_7: "f32[4096, 2304][2304, 1]cuda:0" = torch.rms_norm(res_ads_11, (2304,), l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_11 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_6: "f32[4096, 2304][2304, 1]cuda:0" = torch.sigmoid(output_6) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_6: "f32[4096, 2304][2304, 1]cuda:0" = output_6 * sigmoid_6; output_6 = sigmoid_6 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_7: "f32[4096, 2304][2304, 1]cuda:0" = torch.sigmoid(output_7) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_7: "f32[4096, 2304][2304, 1]cuda:0" = output_7 * sigmoid_7; output_7 = sigmoid_7 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_12: "bf16[4096, 9216][9216, 1]cuda:0" = torch._C._nn.linear(mul_6, l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_, None); mul_6 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_12: "bf16[4096, 4608][9216, 1]cuda:0" = linear_12[(slice(None, None, None), slice(None, 4608, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_13: "bf16[4096, 4608][4608, 1]cuda:0" = torch._C._nn.linear(mul_7, l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_, None); mul_7 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_18: "bf16[4096, 4608][9216, 1]cuda:0" = linear_12[(slice(None, None, None), slice(4608, None, None))]; linear_12 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_9: "bf16[4096, 4608][4608, 1]cuda:0" = torch.index_select(getitem_18, dim = 0, index = dummy_ads_to_user_map); getitem_18 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_12: "bf16[4096, 4608][4608, 1]cuda:0" = linear_13 + index_select_9; linear_13 = index_select_9 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:131 in __add__, code: res_user = self.user + other.user | |
| res_user_13: "f32[4096, 4608][4608, 1]cuda:0" = mul_4 + res_user_12; mul_4 = res_user_12 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:132 in __add__, code: res_ads = self.ads + other.ads | |
| res_ads_13: "f32[4096, 4608][4608, 1]cuda:0" = mul_5 + res_ads_12; mul_5 = res_ads_12 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_8: "f32[4096, 4608][4608, 1]cuda:0" = torch.rms_norm(res_user_13, (4608,), l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_13 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_9: "f32[4096, 4608][4608, 1]cuda:0" = torch.rms_norm(res_ads_13, (4608,), l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_13 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_8: "f32[4096, 4608][4608, 1]cuda:0" = torch.sigmoid(output_8) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_8: "f32[4096, 4608][4608, 1]cuda:0" = output_8 * sigmoid_8; output_8 = sigmoid_8 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_9: "f32[4096, 4608][4608, 1]cuda:0" = torch.sigmoid(output_9) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_9: "f32[4096, 4608][4608, 1]cuda:0" = output_9 * sigmoid_9; output_9 = sigmoid_9 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_14: "bf16[4096, 3072][3072, 1]cuda:0" = torch._C._nn.linear(mul_8, l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_14: "bf16[4096, 1536][3072, 1]cuda:0" = linear_14[(slice(None, None, None), slice(None, 1536, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_15: "bf16[4096, 1536][1536, 1]cuda:0" = torch._C._nn.linear(mul_9, l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_20: "bf16[4096, 1536][3072, 1]cuda:0" = linear_14[(slice(None, None, None), slice(1536, None, None))]; linear_14 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_10: "bf16[4096, 1536][1536, 1]cuda:0" = torch.index_select(getitem_20, dim = 0, index = dummy_ads_to_user_map); getitem_20 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_14: "bf16[4096, 1536][1536, 1]cuda:0" = linear_15 + index_select_10; linear_15 = index_select_10 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_10: "f32[4096, 1536][1536, 1]cuda:0" = torch.rms_norm(res_user_14, (1536,), l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_14 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_11: "f32[4096, 1536][1536, 1]cuda:0" = torch.rms_norm(res_ads_14, (1536,), l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_14 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_10: "f32[4096, 1536][1536, 1]cuda:0" = torch.sigmoid(output_10) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_10: "f32[4096, 1536][1536, 1]cuda:0" = output_10 * sigmoid_10; output_10 = sigmoid_10 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_11: "f32[4096, 1536][1536, 1]cuda:0" = torch.sigmoid(output_11) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_11: "f32[4096, 1536][1536, 1]cuda:0" = output_11 * sigmoid_11; output_11 = sigmoid_11 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_16: "bf16[4096, 9216][9216, 1]cuda:0" = torch._C._nn.linear(mul_10, l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_, None); mul_10 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_15: "bf16[4096, 4608][9216, 1]cuda:0" = linear_16[(slice(None, None, None), slice(None, 4608, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_17: "bf16[4096, 4608][4608, 1]cuda:0" = torch._C._nn.linear(mul_11, l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_, None); mul_11 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_22: "bf16[4096, 4608][9216, 1]cuda:0" = linear_16[(slice(None, None, None), slice(4608, None, None))]; linear_16 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_11: "bf16[4096, 4608][4608, 1]cuda:0" = torch.index_select(getitem_22, dim = 0, index = dummy_ads_to_user_map); getitem_22 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_15: "bf16[4096, 4608][4608, 1]cuda:0" = linear_17 + index_select_11; linear_17 = index_select_11 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:131 in __add__, code: res_user = self.user + other.user | |
| res_user_16: "f32[4096, 4608][4608, 1]cuda:0" = mul_8 + res_user_15; mul_8 = res_user_15 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:132 in __add__, code: res_ads = self.ads + other.ads | |
| res_ads_16: "f32[4096, 4608][4608, 1]cuda:0" = mul_9 + res_ads_15; mul_9 = res_ads_15 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_12: "f32[4096, 4608][4608, 1]cuda:0" = torch.rms_norm(res_user_16, (4608,), l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_16 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_13: "f32[4096, 4608][4608, 1]cuda:0" = torch.rms_norm(res_ads_16, (4608,), l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_16 = l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_12: "f32[4096, 4608][4608, 1]cuda:0" = torch.sigmoid(output_12) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_12: "f32[4096, 4608][4608, 1]cuda:0" = output_12 * sigmoid_12; output_12 = sigmoid_12 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_13: "f32[4096, 4608][4608, 1]cuda:0" = torch.sigmoid(output_13) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_13: "f32[4096, 4608][4608, 1]cuda:0" = output_13 * sigmoid_13; output_13 = sigmoid_13 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_18: "bf16[4096, 12288][12288, 1]cuda:0" = torch._C._nn.linear(mul_12, l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_output_fc_modules_linear_shared_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_output_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_17: "bf16[4096, 6144][12288, 1]cuda:0" = linear_18[(slice(None, None, None), slice(None, 6144, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_19: "bf16[4096, 6144][6144, 1]cuda:0" = torch._C._nn.linear(mul_13, l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_output_fc_modules_linear_ads_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_0_modules_bitmlp_modules_output_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_24: "bf16[4096, 6144][12288, 1]cuda:0" = linear_18[(slice(None, None, None), slice(6144, None, None))]; linear_18 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_12: "bf16[4096, 6144][6144, 1]cuda:0" = torch.index_select(getitem_24, dim = 0, index = dummy_ads_to_user_map); getitem_24 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_17: "bf16[4096, 6144][6144, 1]cuda:0" = linear_19 + index_select_12; linear_19 = index_select_12 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:181 in view_2d_to_3d, code: self.user.view(self.user.shape[0], num_emb, emb_dim), | |
| view_6: "bf16[4096, 48, 128][12288, 128, 1]cuda:0" = res_user_17.view(4096, -1, 128); res_user_17 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:182 in view_2d_to_3d, code: self.ads.view(self.ads.shape[0], num_emb, emb_dim), | |
| view_7: "bf16[4096, 48, 128][6144, 128, 1]cuda:0" = res_ads_17.view(4096, -1, 128); res_ads_17 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:488 in <lambda>, code: lambda: self.linear_shared.weight @ x_user | |
| matmul_10: "bf16[4096, 192, 128][24576, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_0_modules_post_snn_lce_modules_fc_modules_linear_shared_parameters_weight_ @ view_6; l_self_modules_wukong_modules_layers_modules_0_modules_post_snn_lce_modules_fc_modules_linear_shared_parameters_weight_ = view_6 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:490 in forward_lce, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_18: "bf16[4096, 96, 128][24576, 128, 1]cuda:0" = matmul_10[(slice(None, None, None), slice(None, 96, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| matmul_11: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_0_modules_post_snn_lce_modules_fc_modules_linear_ads_parameters_weight_ @ view_7; l_self_modules_wukong_modules_layers_modules_0_modules_post_snn_lce_modules_fc_modules_linear_ads_parameters_weight_ = view_7 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:496 in <lambda>, code: res_shared[:, self.num_out_user :], ads_to_user_map | |
| getitem_26: "bf16[4096, 96, 128][24576, 128, 1]cuda:0" = matmul_10[(slice(None, None, None), slice(96, None, None))]; matmul_10 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_13: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = torch.index_select(getitem_26, dim = 0, index = dummy_ads_to_user_map); getitem_26 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| res_ads_18: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = matmul_11 + index_select_13; matmul_11 = index_select_13 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:131 in __add__, code: res_user = self.user + other.user | |
| res_user_19: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = res_user_18 + res_user_2; res_user_18 = res_user_2 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:132 in __add__, code: res_ads = self.ads + other.ads | |
| res_ads_19: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = res_ads_18 + res_ads_2; res_ads_18 = res_ads_2 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:208 in cat, code: torch.cat([t.user for t in tensors], dim=1), | |
| cat_5: "bf16[4096, 192, 128][24576, 128, 1]cuda:0" = torch.cat([res_user_19, res_user_4], dim = 1); res_user_19 = res_user_4 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:209 in cat, code: torch.cat([t.ads for t in tensors], dim=1), | |
| cat_6: "bf16[4096, 192, 128][24576, 128, 1]cuda:0" = torch.cat([res_ads_19, res_ads_4], dim = 1); res_ads_19 = res_ads_4 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| res_user_20: "f32[4096, 192, 128][24576, 128, 1]cuda:0" = torch.rms_norm(cat_5, (128,), l_self_modules_wukong_modules_layers_modules_0_modules_ln2_modules_ln_user_parameters_weight_, None); cat_5 = l_self_modules_wukong_modules_layers_modules_0_modules_ln2_modules_ln_user_parameters_weight_ = None | |
| res_ads_20: "f32[4096, 192, 128][24576, 128, 1]cuda:0" = torch.rms_norm(cat_6, (128,), l_self_modules_wukong_modules_layers_modules_0_modules_ln2_modules_ln_ads_parameters_weight_, None); cat_6 = l_self_modules_wukong_modules_layers_modules_0_modules_ln2_modules_ln_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:488 in <lambda>, code: lambda: self.linear_shared.weight @ x_user | |
| matmul_12: "bf16[4096, 192, 128][24576, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ @ res_user_20; l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:490 in forward_lce, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_21: "bf16[4096, 96, 128][24576, 128, 1]cuda:0" = matmul_12[(slice(None, None, None), slice(None, 96, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| matmul_13: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_ @ res_ads_20; l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:496 in <lambda>, code: res_shared[:, self.num_out_user :], ads_to_user_map | |
| getitem_28: "bf16[4096, 96, 128][24576, 128, 1]cuda:0" = matmul_12[(slice(None, None, None), slice(96, None, None))]; matmul_12 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_14: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = torch.index_select(getitem_28, dim = 0, index = dummy_ads_to_user_map); getitem_28 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| res_ads_21: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = matmul_13 + index_select_14; matmul_13 = index_select_14 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:488 in <lambda>, code: lambda: self.linear_shared.weight @ x_user | |
| matmul_14: "bf16[4096, 80, 128][10240, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ @ res_user_20; l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:490 in forward_lce, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_22: "bf16[4096, 40, 128][10240, 128, 1]cuda:0" = matmul_14[(slice(None, None, None), slice(None, 40, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| matmul_15: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ @ res_ads_20; l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:496 in <lambda>, code: res_shared[:, self.num_out_user :], ads_to_user_map | |
| getitem_30: "bf16[4096, 40, 128][10240, 128, 1]cuda:0" = matmul_14[(slice(None, None, None), slice(40, None, None))]; matmul_14 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_15: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = torch.index_select(getitem_30, dim = 0, index = dummy_ads_to_user_map); getitem_30 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| res_ads_22: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = matmul_15 + index_select_15; matmul_15 = index_select_15 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:488 in <lambda>, code: lambda: self.linear_shared.weight @ x_user | |
| matmul_16: "bf16[4096, 128, 128][16384, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_ @ res_user_20; l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:490 in forward_lce, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_23: "bf16[4096, 64, 128][16384, 128, 1]cuda:0" = matmul_16[(slice(None, None, None), slice(None, 64, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| matmul_17: "bf16[4096, 64, 128][8192, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_ads_parameters_weight_ @ res_ads_20; l_self_modules_wukong_modules_layers_modules_1_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:496 in <lambda>, code: res_shared[:, self.num_out_user :], ads_to_user_map | |
| getitem_32: "bf16[4096, 64, 128][16384, 128, 1]cuda:0" = matmul_16[(slice(None, None, None), slice(64, None, None))]; matmul_16 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_16: "bf16[4096, 64, 128][8192, 128, 1]cuda:0" = torch.index_select(getitem_32, dim = 0, index = dummy_ads_to_user_map); getitem_32 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| res_ads_23: "bf16[4096, 64, 128][8192, 128, 1]cuda:0" = matmul_17 + index_select_16; matmul_17 = index_select_16 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:189 in view_3d_to_2d, code: self.user.view(self.user.shape[0], emb_dim), | |
| view_8: "bf16[4096, 5120][10240, 1]cuda:0" = res_user_22.view(4096, -1); res_user_22 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:190 in view_3d_to_2d, code: self.ads.view(self.ads.shape[0], emb_dim), | |
| view_9: "bf16[4096, 5120][5120, 1]cuda:0" = res_ads_22.view(4096, -1); res_ads_22 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_20: "bf16[4096, 1024][1024, 1]cuda:0" = torch._C._nn.linear(view_8, l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_, None); view_8 = l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_24: "bf16[4096, 512][1024, 1]cuda:0" = linear_20[(slice(None, None, None), slice(None, 512, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_21: "bf16[4096, 512][512, 1]cuda:0" = torch._C._nn.linear(view_9, l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_, None); view_9 = l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_34: "bf16[4096, 512][1024, 1]cuda:0" = linear_20[(slice(None, None, None), slice(512, None, None))]; linear_20 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_17: "bf16[4096, 512][512, 1]cuda:0" = torch.index_select(getitem_34, dim = 0, index = dummy_ads_to_user_map); getitem_34 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_24: "bf16[4096, 512][512, 1]cuda:0" = linear_21 + index_select_17; linear_21 = index_select_17 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_14: "f32[4096, 512][512, 1]cuda:0" = torch.rms_norm(res_user_24, (512,), l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_24 = l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_15: "f32[4096, 512][512, 1]cuda:0" = torch.rms_norm(res_ads_24, (512,), l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_24 = l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_14: "f32[4096, 512][512, 1]cuda:0" = torch.sigmoid(output_14) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_14: "f32[4096, 512][512, 1]cuda:0" = output_14 * sigmoid_14; output_14 = sigmoid_14 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_15: "f32[4096, 512][512, 1]cuda:0" = torch.sigmoid(output_15) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_15: "f32[4096, 512][512, 1]cuda:0" = output_15 * sigmoid_15; output_15 = sigmoid_15 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_22: "bf16[4096, 1024][1024, 1]cuda:0" = torch._C._nn.linear(mul_14, l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_, None); mul_14 = l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_25: "bf16[4096, 512][1024, 1]cuda:0" = linear_22[(slice(None, None, None), slice(None, 512, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_23: "bf16[4096, 512][512, 1]cuda:0" = torch._C._nn.linear(mul_15, l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_, None); mul_15 = l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_36: "bf16[4096, 512][1024, 1]cuda:0" = linear_22[(slice(None, None, None), slice(512, None, None))]; linear_22 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_18: "bf16[4096, 512][512, 1]cuda:0" = torch.index_select(getitem_36, dim = 0, index = dummy_ads_to_user_map); getitem_36 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_25: "bf16[4096, 512][512, 1]cuda:0" = linear_23 + index_select_18; linear_23 = index_select_18 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_16: "f32[4096, 512][512, 1]cuda:0" = torch.rms_norm(res_user_25, (512,), l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_25 = l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_17: "f32[4096, 512][512, 1]cuda:0" = torch.rms_norm(res_ads_25, (512,), l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_25 = l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_16: "f32[4096, 512][512, 1]cuda:0" = torch.sigmoid(output_16) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_16: "f32[4096, 512][512, 1]cuda:0" = output_16 * sigmoid_16; output_16 = sigmoid_16 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_17: "f32[4096, 512][512, 1]cuda:0" = torch.sigmoid(output_17) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_17: "f32[4096, 512][512, 1]cuda:0" = output_17 * sigmoid_17; output_17 = sigmoid_17 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_24: "bf16[4096, 30720][30720, 1]cuda:0" = torch._C._nn.linear(mul_16, l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_, None); mul_16 = l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_26: "bf16[4096, 15360][30720, 1]cuda:0" = linear_24[(slice(None, None, None), slice(None, 15360, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_25: "bf16[4096, 15360][15360, 1]cuda:0" = torch._C._nn.linear(mul_17, l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_, None); mul_17 = l_self_modules_wukong_modules_layers_modules_1_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_38: "bf16[4096, 15360][30720, 1]cuda:0" = linear_24[(slice(None, None, None), slice(15360, None, None))]; linear_24 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_19: "bf16[4096, 15360][15360, 1]cuda:0" = torch.index_select(getitem_38, dim = 0, index = dummy_ads_to_user_map); getitem_38 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_26: "bf16[4096, 15360][15360, 1]cuda:0" = linear_25 + index_select_19; linear_25 = index_select_19 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:181 in view_2d_to_3d, code: self.user.view(self.user.shape[0], num_emb, emb_dim), | |
| view_10: "bf16[4096, 40, 384][30720, 384, 1]cuda:0" = res_user_26.view(4096, -1, 384); res_user_26 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:182 in view_2d_to_3d, code: self.ads.view(self.ads.shape[0], num_emb, emb_dim), | |
| view_11: "bf16[4096, 40, 384][15360, 384, 1]cuda:0" = res_ads_26.view(4096, -1, 384); res_ads_26 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_20: "f32[4096, 192, 128][24576, 128, 1]cuda:0" = torch.index_select(res_user_20, dim = 0, index = dummy_ads_to_user_map) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:980 in forward, code: x_all = torch.cat( | |
| cat_7: "f32[4096, 384, 128][49152, 128, 1]cuda:0" = torch.cat([index_select_20, res_ads_20], dim = 1); index_select_20 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:989 in <lambda>, code: lambda: v_ads @ x_all | |
| matmul_18: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = view_11 @ cat_7; view_11 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:993 in <lambda>, code: lambda: v_user[:, :, :num_emb_user] @ x.user | |
| getitem_39: "bf16[4096, 40, 192][30720, 384, 1]cuda:0" = view_10[(slice(None, None, None), slice(None, None, None), slice(None, 192, None))]; view_10 = None | |
| matmul_19: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = getitem_39 @ res_user_20; getitem_39 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:1006 in <lambda>, code: lambda: vx_ads @ x_all.permute(0, 2, 1) | |
| permute_2: "f32[4096, 128, 384][49152, 1, 128]cuda:0" = cat_7.permute(0, 2, 1); cat_7 = None | |
| res_ads_27: "bf16[4096, 40, 384][15360, 384, 1]cuda:0" = matmul_18 @ permute_2; matmul_18 = permute_2 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:1010 in <lambda>, code: lambda: vx_user @ x.user.permute(0, 2, 1) | |
| permute_3: "f32[4096, 128, 192][24576, 1, 128]cuda:0" = res_user_20.permute(0, 2, 1) | |
| res_user_27: "bf16[4096, 40, 192][7680, 192, 1]cuda:0" = matmul_19 @ permute_3; matmul_19 = permute_3 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:189 in view_3d_to_2d, code: self.user.view(self.user.shape[0], emb_dim), | |
| view_12: "bf16[4096, 7680][7680, 1]cuda:0" = res_user_27.view(4096, -1); res_user_27 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:190 in view_3d_to_2d, code: self.ads.view(self.ads.shape[0], emb_dim), | |
| view_13: "bf16[4096, 15360][15360, 1]cuda:0" = res_ads_27.view(4096, -1); res_ads_27 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:208 in cat, code: torch.cat([t.user for t in tensors], dim=1), | |
| cat_8: "f32[4096, 11776][11776, 1]cuda:0" = torch.cat([view_12, res_user_1], dim = 1); view_12 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:209 in cat, code: torch.cat([t.ads for t in tensors], dim=1), | |
| cat_9: "f32[4096, 19456][19456, 1]cuda:0" = torch.cat([view_13, res_ads_1], dim = 1); view_13 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| res_user_28: "f32[4096, 11776][11776, 1]cuda:0" = torch.rms_norm(cat_8, (11776,), l_self_modules_wukong_modules_layers_modules_1_modules_ln1_modules_ln_user_parameters_weight_, None); cat_8 = l_self_modules_wukong_modules_layers_modules_1_modules_ln1_modules_ln_user_parameters_weight_ = None | |
| res_ads_28: "f32[4096, 19456][19456, 1]cuda:0" = torch.rms_norm(cat_9, (19456,), l_self_modules_wukong_modules_layers_modules_1_modules_ln1_modules_ln_ads_parameters_weight_, None); cat_9 = l_self_modules_wukong_modules_layers_modules_1_modules_ln1_modules_ln_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_26: "bf16[4096, 4608][4608, 1]cuda:0" = torch._C._nn.linear(res_user_28, l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_, None); res_user_28 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_29: "bf16[4096, 2304][4608, 1]cuda:0" = linear_26[(slice(None, None, None), slice(None, 2304, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_27: "bf16[4096, 2304][2304, 1]cuda:0" = torch._C._nn.linear(res_ads_28, l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_, None); res_ads_28 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_41: "bf16[4096, 2304][4608, 1]cuda:0" = linear_26[(slice(None, None, None), slice(2304, None, None))]; linear_26 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_21: "bf16[4096, 2304][2304, 1]cuda:0" = torch.index_select(getitem_41, dim = 0, index = dummy_ads_to_user_map); getitem_41 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_29: "bf16[4096, 2304][2304, 1]cuda:0" = linear_27 + index_select_21; linear_27 = index_select_21 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_18: "f32[4096, 2304][2304, 1]cuda:0" = torch.rms_norm(res_user_29, (2304,), l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_29 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_19: "f32[4096, 2304][2304, 1]cuda:0" = torch.rms_norm(res_ads_29, (2304,), l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_29 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_18: "f32[4096, 2304][2304, 1]cuda:0" = torch.sigmoid(output_18) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_18: "f32[4096, 2304][2304, 1]cuda:0" = output_18 * sigmoid_18; output_18 = sigmoid_18 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_19: "f32[4096, 2304][2304, 1]cuda:0" = torch.sigmoid(output_19) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_19: "f32[4096, 2304][2304, 1]cuda:0" = output_19 * sigmoid_19; output_19 = sigmoid_19 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_28: "bf16[4096, 2304][2304, 1]cuda:0" = torch._C._nn.linear(mul_18, l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_30: "bf16[4096, 1152][2304, 1]cuda:0" = linear_28[(slice(None, None, None), slice(None, 1152, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_29: "bf16[4096, 1152][1152, 1]cuda:0" = torch._C._nn.linear(mul_19, l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_43: "bf16[4096, 1152][2304, 1]cuda:0" = linear_28[(slice(None, None, None), slice(1152, None, None))]; linear_28 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_22: "bf16[4096, 1152][1152, 1]cuda:0" = torch.index_select(getitem_43, dim = 0, index = dummy_ads_to_user_map); getitem_43 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_30: "bf16[4096, 1152][1152, 1]cuda:0" = linear_29 + index_select_22; linear_29 = index_select_22 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_20: "f32[4096, 1152][1152, 1]cuda:0" = torch.rms_norm(res_user_30, (1152,), l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_30 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_21: "f32[4096, 1152][1152, 1]cuda:0" = torch.rms_norm(res_ads_30, (1152,), l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_30 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_20: "f32[4096, 1152][1152, 1]cuda:0" = torch.sigmoid(output_20) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_20: "f32[4096, 1152][1152, 1]cuda:0" = output_20 * sigmoid_20; output_20 = sigmoid_20 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_21: "f32[4096, 1152][1152, 1]cuda:0" = torch.sigmoid(output_21) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_21: "f32[4096, 1152][1152, 1]cuda:0" = output_21 * sigmoid_21; output_21 = sigmoid_21 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_30: "bf16[4096, 4608][4608, 1]cuda:0" = torch._C._nn.linear(mul_20, l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_, None); mul_20 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_31: "bf16[4096, 2304][4608, 1]cuda:0" = linear_30[(slice(None, None, None), slice(None, 2304, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_31: "bf16[4096, 2304][2304, 1]cuda:0" = torch._C._nn.linear(mul_21, l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_, None); mul_21 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_45: "bf16[4096, 2304][4608, 1]cuda:0" = linear_30[(slice(None, None, None), slice(2304, None, None))]; linear_30 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_23: "bf16[4096, 2304][2304, 1]cuda:0" = torch.index_select(getitem_45, dim = 0, index = dummy_ads_to_user_map); getitem_45 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_31: "bf16[4096, 2304][2304, 1]cuda:0" = linear_31 + index_select_23; linear_31 = index_select_23 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:131 in __add__, code: res_user = self.user + other.user | |
| res_user_32: "f32[4096, 2304][2304, 1]cuda:0" = mul_18 + res_user_31; mul_18 = res_user_31 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:132 in __add__, code: res_ads = self.ads + other.ads | |
| res_ads_32: "f32[4096, 2304][2304, 1]cuda:0" = mul_19 + res_ads_31; mul_19 = res_ads_31 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_22: "f32[4096, 2304][2304, 1]cuda:0" = torch.rms_norm(res_user_32, (2304,), l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_32 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_23: "f32[4096, 2304][2304, 1]cuda:0" = torch.rms_norm(res_ads_32, (2304,), l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_32 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_22: "f32[4096, 2304][2304, 1]cuda:0" = torch.sigmoid(output_22) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_22: "f32[4096, 2304][2304, 1]cuda:0" = output_22 * sigmoid_22; output_22 = sigmoid_22 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_23: "f32[4096, 2304][2304, 1]cuda:0" = torch.sigmoid(output_23) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_23: "f32[4096, 2304][2304, 1]cuda:0" = output_23 * sigmoid_23; output_23 = sigmoid_23 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_32: "bf16[4096, 1920][1920, 1]cuda:0" = torch._C._nn.linear(mul_22, l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_33: "bf16[4096, 960][1920, 1]cuda:0" = linear_32[(slice(None, None, None), slice(None, 960, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_33: "bf16[4096, 960][960, 1]cuda:0" = torch._C._nn.linear(mul_23, l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_47: "bf16[4096, 960][1920, 1]cuda:0" = linear_32[(slice(None, None, None), slice(960, None, None))]; linear_32 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_24: "bf16[4096, 960][960, 1]cuda:0" = torch.index_select(getitem_47, dim = 0, index = dummy_ads_to_user_map); getitem_47 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_33: "bf16[4096, 960][960, 1]cuda:0" = linear_33 + index_select_24; linear_33 = index_select_24 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_24: "f32[4096, 960][960, 1]cuda:0" = torch.rms_norm(res_user_33, (960,), l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_33 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_25: "f32[4096, 960][960, 1]cuda:0" = torch.rms_norm(res_ads_33, (960,), l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_33 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_24: "f32[4096, 960][960, 1]cuda:0" = torch.sigmoid(output_24) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_24: "f32[4096, 960][960, 1]cuda:0" = output_24 * sigmoid_24; output_24 = sigmoid_24 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_25: "f32[4096, 960][960, 1]cuda:0" = torch.sigmoid(output_25) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_25: "f32[4096, 960][960, 1]cuda:0" = output_25 * sigmoid_25; output_25 = sigmoid_25 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_34: "bf16[4096, 4608][4608, 1]cuda:0" = torch._C._nn.linear(mul_24, l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_, None); mul_24 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_34: "bf16[4096, 2304][4608, 1]cuda:0" = linear_34[(slice(None, None, None), slice(None, 2304, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_35: "bf16[4096, 2304][2304, 1]cuda:0" = torch._C._nn.linear(mul_25, l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_, None); mul_25 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_49: "bf16[4096, 2304][4608, 1]cuda:0" = linear_34[(slice(None, None, None), slice(2304, None, None))]; linear_34 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_25: "bf16[4096, 2304][2304, 1]cuda:0" = torch.index_select(getitem_49, dim = 0, index = dummy_ads_to_user_map); getitem_49 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_34: "bf16[4096, 2304][2304, 1]cuda:0" = linear_35 + index_select_25; linear_35 = index_select_25 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:131 in __add__, code: res_user = self.user + other.user | |
| res_user_35: "f32[4096, 2304][2304, 1]cuda:0" = mul_22 + res_user_34; mul_22 = res_user_34 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:132 in __add__, code: res_ads = self.ads + other.ads | |
| res_ads_35: "f32[4096, 2304][2304, 1]cuda:0" = mul_23 + res_ads_34; mul_23 = res_ads_34 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_26: "f32[4096, 2304][2304, 1]cuda:0" = torch.rms_norm(res_user_35, (2304,), l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_35 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_27: "f32[4096, 2304][2304, 1]cuda:0" = torch.rms_norm(res_ads_35, (2304,), l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_35 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_26: "f32[4096, 2304][2304, 1]cuda:0" = torch.sigmoid(output_26) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_26: "f32[4096, 2304][2304, 1]cuda:0" = output_26 * sigmoid_26; output_26 = sigmoid_26 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_27: "f32[4096, 2304][2304, 1]cuda:0" = torch.sigmoid(output_27) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_27: "f32[4096, 2304][2304, 1]cuda:0" = output_27 * sigmoid_27; output_27 = sigmoid_27 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_36: "bf16[4096, 12288][12288, 1]cuda:0" = torch._C._nn.linear(mul_26, l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_output_fc_modules_linear_shared_parameters_weight_, None); mul_26 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_output_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_36: "bf16[4096, 6144][12288, 1]cuda:0" = linear_36[(slice(None, None, None), slice(None, 6144, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_37: "bf16[4096, 6144][6144, 1]cuda:0" = torch._C._nn.linear(mul_27, l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_output_fc_modules_linear_ads_parameters_weight_, None); mul_27 = l_self_modules_wukong_modules_layers_modules_1_modules_bitmlp_modules_output_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_51: "bf16[4096, 6144][12288, 1]cuda:0" = linear_36[(slice(None, None, None), slice(6144, None, None))]; linear_36 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_26: "bf16[4096, 6144][6144, 1]cuda:0" = torch.index_select(getitem_51, dim = 0, index = dummy_ads_to_user_map); getitem_51 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_36: "bf16[4096, 6144][6144, 1]cuda:0" = linear_37 + index_select_26; linear_37 = index_select_26 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:181 in view_2d_to_3d, code: self.user.view(self.user.shape[0], num_emb, emb_dim), | |
| view_14: "bf16[4096, 48, 128][12288, 128, 1]cuda:0" = res_user_36.view(4096, -1, 128); res_user_36 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:182 in view_2d_to_3d, code: self.ads.view(self.ads.shape[0], num_emb, emb_dim), | |
| view_15: "bf16[4096, 48, 128][6144, 128, 1]cuda:0" = res_ads_36.view(4096, -1, 128); res_ads_36 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:488 in <lambda>, code: lambda: self.linear_shared.weight @ x_user | |
| matmul_22: "bf16[4096, 192, 128][24576, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_1_modules_post_snn_lce_modules_fc_modules_linear_shared_parameters_weight_ @ view_14; l_self_modules_wukong_modules_layers_modules_1_modules_post_snn_lce_modules_fc_modules_linear_shared_parameters_weight_ = view_14 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:490 in forward_lce, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_37: "bf16[4096, 96, 128][24576, 128, 1]cuda:0" = matmul_22[(slice(None, None, None), slice(None, 96, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| matmul_23: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_1_modules_post_snn_lce_modules_fc_modules_linear_ads_parameters_weight_ @ view_15; l_self_modules_wukong_modules_layers_modules_1_modules_post_snn_lce_modules_fc_modules_linear_ads_parameters_weight_ = view_15 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:496 in <lambda>, code: res_shared[:, self.num_out_user :], ads_to_user_map | |
| getitem_53: "bf16[4096, 96, 128][24576, 128, 1]cuda:0" = matmul_22[(slice(None, None, None), slice(96, None, None))]; matmul_22 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_27: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = torch.index_select(getitem_53, dim = 0, index = dummy_ads_to_user_map); getitem_53 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| res_ads_37: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = matmul_23 + index_select_27; matmul_23 = index_select_27 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:131 in __add__, code: res_user = self.user + other.user | |
| res_user_38: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = res_user_37 + res_user_21; res_user_37 = res_user_21 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:132 in __add__, code: res_ads = self.ads + other.ads | |
| res_ads_38: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = res_ads_37 + res_ads_21; res_ads_37 = res_ads_21 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:208 in cat, code: torch.cat([t.user for t in tensors], dim=1), | |
| cat_10: "bf16[4096, 160, 128][20480, 128, 1]cuda:0" = torch.cat([res_user_38, res_user_23], dim = 1); res_user_38 = res_user_23 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:209 in cat, code: torch.cat([t.ads for t in tensors], dim=1), | |
| cat_11: "bf16[4096, 160, 128][20480, 128, 1]cuda:0" = torch.cat([res_ads_38, res_ads_23], dim = 1); res_ads_38 = res_ads_23 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| res_user_39: "f32[4096, 160, 128][20480, 128, 1]cuda:0" = torch.rms_norm(cat_10, (128,), l_self_modules_wukong_modules_layers_modules_1_modules_ln2_modules_ln_user_parameters_weight_, None); cat_10 = l_self_modules_wukong_modules_layers_modules_1_modules_ln2_modules_ln_user_parameters_weight_ = None | |
| res_ads_39: "f32[4096, 160, 128][20480, 128, 1]cuda:0" = torch.rms_norm(cat_11, (128,), l_self_modules_wukong_modules_layers_modules_1_modules_ln2_modules_ln_ads_parameters_weight_, None); cat_11 = l_self_modules_wukong_modules_layers_modules_1_modules_ln2_modules_ln_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:488 in <lambda>, code: lambda: self.linear_shared.weight @ x_user | |
| matmul_24: "bf16[4096, 192, 128][24576, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ @ res_user_39; l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:490 in forward_lce, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_40: "bf16[4096, 96, 128][24576, 128, 1]cuda:0" = matmul_24[(slice(None, None, None), slice(None, 96, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| matmul_25: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_ @ res_ads_39; l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_0_modules_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:496 in <lambda>, code: res_shared[:, self.num_out_user :], ads_to_user_map | |
| getitem_55: "bf16[4096, 96, 128][24576, 128, 1]cuda:0" = matmul_24[(slice(None, None, None), slice(96, None, None))]; matmul_24 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_28: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = torch.index_select(getitem_55, dim = 0, index = dummy_ads_to_user_map); getitem_55 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| res_ads_40: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = matmul_25 + index_select_28; matmul_25 = index_select_28 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:488 in <lambda>, code: lambda: self.linear_shared.weight @ x_user | |
| matmul_26: "bf16[4096, 80, 128][10240, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ @ res_user_39; l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:490 in forward_lce, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_41: "bf16[4096, 40, 128][10240, 128, 1]cuda:0" = matmul_26[(slice(None, None, None), slice(None, 40, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| matmul_27: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ @ res_ads_39; l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:496 in <lambda>, code: res_shared[:, self.num_out_user :], ads_to_user_map | |
| getitem_57: "bf16[4096, 40, 128][10240, 128, 1]cuda:0" = matmul_26[(slice(None, None, None), slice(40, None, None))]; matmul_26 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_29: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = torch.index_select(getitem_57, dim = 0, index = dummy_ads_to_user_map); getitem_57 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| res_ads_41: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = matmul_27 + index_select_29; matmul_27 = index_select_29 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:488 in <lambda>, code: lambda: self.linear_shared.weight @ x_user | |
| matmul_28: "bf16[4096, 128, 128][16384, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_ @ res_user_39; l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:490 in forward_lce, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_42: "bf16[4096, 64, 128][16384, 128, 1]cuda:0" = matmul_28[(slice(None, None, None), slice(None, 64, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| matmul_29: "bf16[4096, 64, 128][8192, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_ads_parameters_weight_ @ res_ads_39; l_self_modules_wukong_modules_layers_modules_2_modules_maybe_fused_lce_modules_lces_modules_2_modules_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:496 in <lambda>, code: res_shared[:, self.num_out_user :], ads_to_user_map | |
| getitem_59: "bf16[4096, 64, 128][16384, 128, 1]cuda:0" = matmul_28[(slice(None, None, None), slice(64, None, None))]; matmul_28 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_30: "bf16[4096, 64, 128][8192, 128, 1]cuda:0" = torch.index_select(getitem_59, dim = 0, index = dummy_ads_to_user_map); getitem_59 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| res_ads_42: "bf16[4096, 64, 128][8192, 128, 1]cuda:0" = matmul_29 + index_select_30; matmul_29 = index_select_30 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:189 in view_3d_to_2d, code: self.user.view(self.user.shape[0], emb_dim), | |
| view_16: "bf16[4096, 5120][10240, 1]cuda:0" = res_user_41.view(4096, -1); res_user_41 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:190 in view_3d_to_2d, code: self.ads.view(self.ads.shape[0], emb_dim), | |
| view_17: "bf16[4096, 5120][5120, 1]cuda:0" = res_ads_41.view(4096, -1); res_ads_41 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_38: "bf16[4096, 1024][1024, 1]cuda:0" = torch._C._nn.linear(view_16, l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_, None); view_16 = l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_43: "bf16[4096, 512][1024, 1]cuda:0" = linear_38[(slice(None, None, None), slice(None, 512, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_39: "bf16[4096, 512][512, 1]cuda:0" = torch._C._nn.linear(view_17, l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_, None); view_17 = l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_61: "bf16[4096, 512][1024, 1]cuda:0" = linear_38[(slice(None, None, None), slice(512, None, None))]; linear_38 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_31: "bf16[4096, 512][512, 1]cuda:0" = torch.index_select(getitem_61, dim = 0, index = dummy_ads_to_user_map); getitem_61 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_43: "bf16[4096, 512][512, 1]cuda:0" = linear_39 + index_select_31; linear_39 = index_select_31 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_28: "f32[4096, 512][512, 1]cuda:0" = torch.rms_norm(res_user_43, (512,), l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_43 = l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_29: "f32[4096, 512][512, 1]cuda:0" = torch.rms_norm(res_ads_43, (512,), l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_43 = l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_28: "f32[4096, 512][512, 1]cuda:0" = torch.sigmoid(output_28) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_28: "f32[4096, 512][512, 1]cuda:0" = output_28 * sigmoid_28; output_28 = sigmoid_28 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_29: "f32[4096, 512][512, 1]cuda:0" = torch.sigmoid(output_29) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_29: "f32[4096, 512][512, 1]cuda:0" = output_29 * sigmoid_29; output_29 = sigmoid_29 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_40: "bf16[4096, 1024][1024, 1]cuda:0" = torch._C._nn.linear(mul_28, l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_, None); mul_28 = l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_44: "bf16[4096, 512][1024, 1]cuda:0" = linear_40[(slice(None, None, None), slice(None, 512, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_41: "bf16[4096, 512][512, 1]cuda:0" = torch._C._nn.linear(mul_29, l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_, None); mul_29 = l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_63: "bf16[4096, 512][1024, 1]cuda:0" = linear_40[(slice(None, None, None), slice(512, None, None))]; linear_40 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_32: "bf16[4096, 512][512, 1]cuda:0" = torch.index_select(getitem_63, dim = 0, index = dummy_ads_to_user_map); getitem_63 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_44: "bf16[4096, 512][512, 1]cuda:0" = linear_41 + index_select_32; linear_41 = index_select_32 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_30: "f32[4096, 512][512, 1]cuda:0" = torch.rms_norm(res_user_44, (512,), l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_44 = l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_31: "f32[4096, 512][512, 1]cuda:0" = torch.rms_norm(res_ads_44, (512,), l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_44 = l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_30: "f32[4096, 512][512, 1]cuda:0" = torch.sigmoid(output_30) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_30: "f32[4096, 512][512, 1]cuda:0" = output_30 * sigmoid_30; output_30 = sigmoid_30 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_31: "f32[4096, 512][512, 1]cuda:0" = torch.sigmoid(output_31) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_31: "f32[4096, 512][512, 1]cuda:0" = output_31 * sigmoid_31; output_31 = sigmoid_31 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_42: "bf16[4096, 25600][25600, 1]cuda:0" = torch._C._nn.linear(mul_30, l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_, None); mul_30 = l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_45: "bf16[4096, 12800][25600, 1]cuda:0" = linear_42[(slice(None, None, None), slice(None, 12800, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_43: "bf16[4096, 12800][12800, 1]cuda:0" = torch._C._nn.linear(mul_31, l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_, None); mul_31 = l_self_modules_wukong_modules_layers_modules_2_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_65: "bf16[4096, 12800][25600, 1]cuda:0" = linear_42[(slice(None, None, None), slice(12800, None, None))]; linear_42 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_33: "bf16[4096, 12800][12800, 1]cuda:0" = torch.index_select(getitem_65, dim = 0, index = dummy_ads_to_user_map); getitem_65 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_45: "bf16[4096, 12800][12800, 1]cuda:0" = linear_43 + index_select_33; linear_43 = index_select_33 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:181 in view_2d_to_3d, code: self.user.view(self.user.shape[0], num_emb, emb_dim), | |
| view_18: "bf16[4096, 40, 320][25600, 320, 1]cuda:0" = res_user_45.view(4096, -1, 320); res_user_45 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:182 in view_2d_to_3d, code: self.ads.view(self.ads.shape[0], num_emb, emb_dim), | |
| view_19: "bf16[4096, 40, 320][12800, 320, 1]cuda:0" = res_ads_45.view(4096, -1, 320); res_ads_45 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_34: "f32[4096, 160, 128][20480, 128, 1]cuda:0" = torch.index_select(res_user_39, dim = 0, index = dummy_ads_to_user_map) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:980 in forward, code: x_all = torch.cat( | |
| cat_12: "f32[4096, 320, 128][40960, 128, 1]cuda:0" = torch.cat([index_select_34, res_ads_39], dim = 1); index_select_34 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:989 in <lambda>, code: lambda: v_ads @ x_all | |
| matmul_30: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = view_19 @ cat_12; view_19 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:993 in <lambda>, code: lambda: v_user[:, :, :num_emb_user] @ x.user | |
| getitem_66: "bf16[4096, 40, 160][25600, 320, 1]cuda:0" = view_18[(slice(None, None, None), slice(None, None, None), slice(None, 160, None))]; view_18 = None | |
| matmul_31: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = getitem_66 @ res_user_39; getitem_66 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:1006 in <lambda>, code: lambda: vx_ads @ x_all.permute(0, 2, 1) | |
| permute_4: "f32[4096, 128, 320][40960, 1, 128]cuda:0" = cat_12.permute(0, 2, 1); cat_12 = None | |
| res_ads_46: "bf16[4096, 40, 320][12800, 320, 1]cuda:0" = matmul_30 @ permute_4; matmul_30 = permute_4 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:1010 in <lambda>, code: lambda: vx_user @ x.user.permute(0, 2, 1) | |
| permute_5: "f32[4096, 128, 160][20480, 1, 128]cuda:0" = res_user_39.permute(0, 2, 1) | |
| res_user_46: "bf16[4096, 40, 160][6400, 160, 1]cuda:0" = matmul_31 @ permute_5; matmul_31 = permute_5 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:189 in view_3d_to_2d, code: self.user.view(self.user.shape[0], emb_dim), | |
| view_20: "bf16[4096, 6400][6400, 1]cuda:0" = res_user_46.view(4096, -1); res_user_46 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:190 in view_3d_to_2d, code: self.ads.view(self.ads.shape[0], emb_dim), | |
| view_21: "bf16[4096, 12800][12800, 1]cuda:0" = res_ads_46.view(4096, -1); res_ads_46 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:208 in cat, code: torch.cat([t.user for t in tensors], dim=1), | |
| cat_13: "f32[4096, 10496][10496, 1]cuda:0" = torch.cat([view_20, res_user_1], dim = 1); view_20 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:209 in cat, code: torch.cat([t.ads for t in tensors], dim=1), | |
| cat_14: "f32[4096, 16896][16896, 1]cuda:0" = torch.cat([view_21, res_ads_1], dim = 1); view_21 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| res_user_47: "f32[4096, 10496][10496, 1]cuda:0" = torch.rms_norm(cat_13, (10496,), l_self_modules_wukong_modules_layers_modules_2_modules_ln1_modules_ln_user_parameters_weight_, None); cat_13 = l_self_modules_wukong_modules_layers_modules_2_modules_ln1_modules_ln_user_parameters_weight_ = None | |
| res_ads_47: "f32[4096, 16896][16896, 1]cuda:0" = torch.rms_norm(cat_14, (16896,), l_self_modules_wukong_modules_layers_modules_2_modules_ln1_modules_ln_ads_parameters_weight_, None); cat_14 = l_self_modules_wukong_modules_layers_modules_2_modules_ln1_modules_ln_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_44: "bf16[4096, 9216][9216, 1]cuda:0" = torch._C._nn.linear(res_user_47, l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_, None); res_user_47 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_48: "bf16[4096, 4608][9216, 1]cuda:0" = linear_44[(slice(None, None, None), slice(None, 4608, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_45: "bf16[4096, 4608][4608, 1]cuda:0" = torch._C._nn.linear(res_ads_47, l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_, None); res_ads_47 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_68: "bf16[4096, 4608][9216, 1]cuda:0" = linear_44[(slice(None, None, None), slice(4608, None, None))]; linear_44 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_35: "bf16[4096, 4608][4608, 1]cuda:0" = torch.index_select(getitem_68, dim = 0, index = dummy_ads_to_user_map); getitem_68 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_48: "bf16[4096, 4608][4608, 1]cuda:0" = linear_45 + index_select_35; linear_45 = index_select_35 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_32: "f32[4096, 4608][4608, 1]cuda:0" = torch.rms_norm(res_user_48, (4608,), l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_48 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_33: "f32[4096, 4608][4608, 1]cuda:0" = torch.rms_norm(res_ads_48, (4608,), l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_48 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_32: "f32[4096, 4608][4608, 1]cuda:0" = torch.sigmoid(output_32) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_32: "f32[4096, 4608][4608, 1]cuda:0" = output_32 * sigmoid_32; output_32 = sigmoid_32 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_33: "f32[4096, 4608][4608, 1]cuda:0" = torch.sigmoid(output_33) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_33: "f32[4096, 4608][4608, 1]cuda:0" = output_33 * sigmoid_33; output_33 = sigmoid_33 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_46: "bf16[4096, 4608][4608, 1]cuda:0" = torch._C._nn.linear(mul_32, l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_49: "bf16[4096, 2304][4608, 1]cuda:0" = linear_46[(slice(None, None, None), slice(None, 2304, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_47: "bf16[4096, 2304][2304, 1]cuda:0" = torch._C._nn.linear(mul_33, l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_70: "bf16[4096, 2304][4608, 1]cuda:0" = linear_46[(slice(None, None, None), slice(2304, None, None))]; linear_46 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_36: "bf16[4096, 2304][2304, 1]cuda:0" = torch.index_select(getitem_70, dim = 0, index = dummy_ads_to_user_map); getitem_70 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_49: "bf16[4096, 2304][2304, 1]cuda:0" = linear_47 + index_select_36; linear_47 = index_select_36 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_34: "f32[4096, 2304][2304, 1]cuda:0" = torch.rms_norm(res_user_49, (2304,), l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_49 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_35: "f32[4096, 2304][2304, 1]cuda:0" = torch.rms_norm(res_ads_49, (2304,), l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_49 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_34: "f32[4096, 2304][2304, 1]cuda:0" = torch.sigmoid(output_34) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_34: "f32[4096, 2304][2304, 1]cuda:0" = output_34 * sigmoid_34; output_34 = sigmoid_34 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_35: "f32[4096, 2304][2304, 1]cuda:0" = torch.sigmoid(output_35) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_35: "f32[4096, 2304][2304, 1]cuda:0" = output_35 * sigmoid_35; output_35 = sigmoid_35 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_48: "bf16[4096, 9216][9216, 1]cuda:0" = torch._C._nn.linear(mul_34, l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_, None); mul_34 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_50: "bf16[4096, 4608][9216, 1]cuda:0" = linear_48[(slice(None, None, None), slice(None, 4608, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_49: "bf16[4096, 4608][4608, 1]cuda:0" = torch._C._nn.linear(mul_35, l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_, None); mul_35 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_72: "bf16[4096, 4608][9216, 1]cuda:0" = linear_48[(slice(None, None, None), slice(4608, None, None))]; linear_48 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_37: "bf16[4096, 4608][4608, 1]cuda:0" = torch.index_select(getitem_72, dim = 0, index = dummy_ads_to_user_map); getitem_72 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_50: "bf16[4096, 4608][4608, 1]cuda:0" = linear_49 + index_select_37; linear_49 = index_select_37 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:131 in __add__, code: res_user = self.user + other.user | |
| res_user_51: "f32[4096, 4608][4608, 1]cuda:0" = mul_32 + res_user_50; mul_32 = res_user_50 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:132 in __add__, code: res_ads = self.ads + other.ads | |
| res_ads_51: "f32[4096, 4608][4608, 1]cuda:0" = mul_33 + res_ads_50; mul_33 = res_ads_50 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_36: "f32[4096, 4608][4608, 1]cuda:0" = torch.rms_norm(res_user_51, (4608,), l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_51 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_37: "f32[4096, 4608][4608, 1]cuda:0" = torch.rms_norm(res_ads_51, (4608,), l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_51 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_36: "f32[4096, 4608][4608, 1]cuda:0" = torch.sigmoid(output_36) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_36: "f32[4096, 4608][4608, 1]cuda:0" = output_36 * sigmoid_36; output_36 = sigmoid_36 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_37: "f32[4096, 4608][4608, 1]cuda:0" = torch.sigmoid(output_37) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_37: "f32[4096, 4608][4608, 1]cuda:0" = output_37 * sigmoid_37; output_37 = sigmoid_37 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_50: "bf16[4096, 3072][3072, 1]cuda:0" = torch._C._nn.linear(mul_36, l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_52: "bf16[4096, 1536][3072, 1]cuda:0" = linear_50[(slice(None, None, None), slice(None, 1536, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_51: "bf16[4096, 1536][1536, 1]cuda:0" = torch._C._nn.linear(mul_37, l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_74: "bf16[4096, 1536][3072, 1]cuda:0" = linear_50[(slice(None, None, None), slice(1536, None, None))]; linear_50 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_38: "bf16[4096, 1536][1536, 1]cuda:0" = torch.index_select(getitem_74, dim = 0, index = dummy_ads_to_user_map); getitem_74 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_52: "bf16[4096, 1536][1536, 1]cuda:0" = linear_51 + index_select_38; linear_51 = index_select_38 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_38: "f32[4096, 1536][1536, 1]cuda:0" = torch.rms_norm(res_user_52, (1536,), l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_52 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_39: "f32[4096, 1536][1536, 1]cuda:0" = torch.rms_norm(res_ads_52, (1536,), l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_52 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_38: "f32[4096, 1536][1536, 1]cuda:0" = torch.sigmoid(output_38) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_38: "f32[4096, 1536][1536, 1]cuda:0" = output_38 * sigmoid_38; output_38 = sigmoid_38 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_39: "f32[4096, 1536][1536, 1]cuda:0" = torch.sigmoid(output_39) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_39: "f32[4096, 1536][1536, 1]cuda:0" = output_39 * sigmoid_39; output_39 = sigmoid_39 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_52: "bf16[4096, 9216][9216, 1]cuda:0" = torch._C._nn.linear(mul_38, l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_, None); mul_38 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_53: "bf16[4096, 4608][9216, 1]cuda:0" = linear_52[(slice(None, None, None), slice(None, 4608, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_53: "bf16[4096, 4608][4608, 1]cuda:0" = torch._C._nn.linear(mul_39, l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_, None); mul_39 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_76: "bf16[4096, 4608][9216, 1]cuda:0" = linear_52[(slice(None, None, None), slice(4608, None, None))]; linear_52 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_39: "bf16[4096, 4608][4608, 1]cuda:0" = torch.index_select(getitem_76, dim = 0, index = dummy_ads_to_user_map); getitem_76 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_53: "bf16[4096, 4608][4608, 1]cuda:0" = linear_53 + index_select_39; linear_53 = index_select_39 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:131 in __add__, code: res_user = self.user + other.user | |
| res_user_54: "f32[4096, 4608][4608, 1]cuda:0" = mul_36 + res_user_53; mul_36 = res_user_53 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:132 in __add__, code: res_ads = self.ads + other.ads | |
| res_ads_54: "f32[4096, 4608][4608, 1]cuda:0" = mul_37 + res_ads_53; mul_37 = res_ads_53 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_40: "f32[4096, 4608][4608, 1]cuda:0" = torch.rms_norm(res_user_54, (4608,), l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_54 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_41: "f32[4096, 4608][4608, 1]cuda:0" = torch.rms_norm(res_ads_54, (4608,), l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_54 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_40: "f32[4096, 4608][4608, 1]cuda:0" = torch.sigmoid(output_40) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_40: "f32[4096, 4608][4608, 1]cuda:0" = output_40 * sigmoid_40; output_40 = sigmoid_40 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_41: "f32[4096, 4608][4608, 1]cuda:0" = torch.sigmoid(output_41) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_41: "f32[4096, 4608][4608, 1]cuda:0" = output_41 * sigmoid_41; output_41 = sigmoid_41 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_54: "bf16[4096, 12288][12288, 1]cuda:0" = torch._C._nn.linear(mul_40, l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_output_fc_modules_linear_shared_parameters_weight_, None); mul_40 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_output_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_55: "bf16[4096, 6144][12288, 1]cuda:0" = linear_54[(slice(None, None, None), slice(None, 6144, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_55: "bf16[4096, 6144][6144, 1]cuda:0" = torch._C._nn.linear(mul_41, l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_output_fc_modules_linear_ads_parameters_weight_, None); mul_41 = l_self_modules_wukong_modules_layers_modules_2_modules_bitmlp_modules_output_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_78: "bf16[4096, 6144][12288, 1]cuda:0" = linear_54[(slice(None, None, None), slice(6144, None, None))]; linear_54 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_40: "bf16[4096, 6144][6144, 1]cuda:0" = torch.index_select(getitem_78, dim = 0, index = dummy_ads_to_user_map); getitem_78 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_55: "bf16[4096, 6144][6144, 1]cuda:0" = linear_55 + index_select_40; linear_55 = index_select_40 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:181 in view_2d_to_3d, code: self.user.view(self.user.shape[0], num_emb, emb_dim), | |
| view_22: "bf16[4096, 48, 128][12288, 128, 1]cuda:0" = res_user_55.view(4096, -1, 128); res_user_55 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:182 in view_2d_to_3d, code: self.ads.view(self.ads.shape[0], num_emb, emb_dim), | |
| view_23: "bf16[4096, 48, 128][6144, 128, 1]cuda:0" = res_ads_55.view(4096, -1, 128); res_ads_55 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:488 in <lambda>, code: lambda: self.linear_shared.weight @ x_user | |
| matmul_34: "bf16[4096, 192, 128][24576, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_2_modules_post_snn_lce_modules_fc_modules_linear_shared_parameters_weight_ @ view_22; l_self_modules_wukong_modules_layers_modules_2_modules_post_snn_lce_modules_fc_modules_linear_shared_parameters_weight_ = view_22 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:490 in forward_lce, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_56: "bf16[4096, 96, 128][24576, 128, 1]cuda:0" = matmul_34[(slice(None, None, None), slice(None, 96, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| matmul_35: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_2_modules_post_snn_lce_modules_fc_modules_linear_ads_parameters_weight_ @ view_23; l_self_modules_wukong_modules_layers_modules_2_modules_post_snn_lce_modules_fc_modules_linear_ads_parameters_weight_ = view_23 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:496 in <lambda>, code: res_shared[:, self.num_out_user :], ads_to_user_map | |
| getitem_80: "bf16[4096, 96, 128][24576, 128, 1]cuda:0" = matmul_34[(slice(None, None, None), slice(96, None, None))]; matmul_34 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_41: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = torch.index_select(getitem_80, dim = 0, index = dummy_ads_to_user_map); getitem_80 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| res_ads_56: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = matmul_35 + index_select_41; matmul_35 = index_select_41 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:131 in __add__, code: res_user = self.user + other.user | |
| res_user_57: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = res_user_56 + res_user_40; res_user_56 = res_user_40 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:132 in __add__, code: res_ads = self.ads + other.ads | |
| res_ads_57: "bf16[4096, 96, 128][12288, 128, 1]cuda:0" = res_ads_56 + res_ads_40; res_ads_56 = res_ads_40 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:208 in cat, code: torch.cat([t.user for t in tensors], dim=1), | |
| cat_15: "bf16[4096, 160, 128][20480, 128, 1]cuda:0" = torch.cat([res_user_57, res_user_42], dim = 1); res_user_57 = res_user_42 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:209 in cat, code: torch.cat([t.ads for t in tensors], dim=1), | |
| cat_16: "bf16[4096, 160, 128][20480, 128, 1]cuda:0" = torch.cat([res_ads_57, res_ads_42], dim = 1); res_ads_57 = res_ads_42 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| res_user_58: "f32[4096, 160, 128][20480, 128, 1]cuda:0" = torch.rms_norm(cat_15, (128,), l_self_modules_wukong_modules_layers_modules_2_modules_ln2_modules_ln_user_parameters_weight_, None); cat_15 = l_self_modules_wukong_modules_layers_modules_2_modules_ln2_modules_ln_user_parameters_weight_ = None | |
| res_ads_58: "f32[4096, 160, 128][20480, 128, 1]cuda:0" = torch.rms_norm(cat_16, (128,), l_self_modules_wukong_modules_layers_modules_2_modules_ln2_modules_ln_ads_parameters_weight_, None); cat_16 = l_self_modules_wukong_modules_layers_modules_2_modules_ln2_modules_ln_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:1155 in forward, code: torch.empty_like(x.user[:, 0:0]), | |
| getitem_81: "f32[4096, 0, 128][20480, 128, 1]cuda:0" = res_user_58[(slice(None, None, None), slice(0, 0, None))] | |
| empty_like: "f32[4096, 0, 128][128, 128, 1]cuda:0" = torch.empty_like(getitem_81); getitem_81 = empty_like = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:1156 in forward, code: torch.empty_like(x.ads[:, 0:0]), | |
| getitem_82: "f32[4096, 0, 128][20480, 128, 1]cuda:0" = res_ads_58[(slice(None, None, None), slice(0, 0, None))] | |
| empty_like_1: "f32[4096, 0, 128][128, 128, 1]cuda:0" = torch.empty_like(getitem_82); getitem_82 = empty_like_1 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:488 in <lambda>, code: lambda: self.linear_shared.weight @ x_user | |
| matmul_36: "bf16[4096, 80, 128][10240, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_3_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ @ res_user_58; l_self_modules_wukong_modules_layers_modules_3_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:490 in forward_lce, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_59: "bf16[4096, 40, 128][10240, 128, 1]cuda:0" = matmul_36[(slice(None, None, None), slice(None, 40, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| matmul_37: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = l_self_modules_wukong_modules_layers_modules_3_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ @ res_ads_58; l_self_modules_wukong_modules_layers_modules_3_modules_maybe_fused_lce_modules_lces_modules_1_modules_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:496 in <lambda>, code: res_shared[:, self.num_out_user :], ads_to_user_map | |
| getitem_84: "bf16[4096, 40, 128][10240, 128, 1]cuda:0" = matmul_36[(slice(None, None, None), slice(40, None, None))]; matmul_36 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_42: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = torch.index_select(getitem_84, dim = 0, index = dummy_ads_to_user_map); getitem_84 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:493 in <lambda>, code: lambda: self.linear_ads.weight @ x_ads | |
| res_ads_59: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = matmul_37 + index_select_42; matmul_37 = index_select_42 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:1155 in forward, code: torch.empty_like(x.user[:, 0:0]), | |
| getitem_85: "f32[4096, 0, 128][20480, 128, 1]cuda:0" = res_user_58[(slice(None, None, None), slice(0, 0, None))] | |
| empty_like_2: "f32[4096, 0, 128][128, 128, 1]cuda:0" = torch.empty_like(getitem_85); getitem_85 = empty_like_2 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:1156 in forward, code: torch.empty_like(x.ads[:, 0:0]), | |
| getitem_86: "f32[4096, 0, 128][20480, 128, 1]cuda:0" = res_ads_58[(slice(None, None, None), slice(0, 0, None))] | |
| empty_like_3: "f32[4096, 0, 128][128, 128, 1]cuda:0" = torch.empty_like(getitem_86); getitem_86 = empty_like_3 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:189 in view_3d_to_2d, code: self.user.view(self.user.shape[0], emb_dim), | |
| view_24: "bf16[4096, 5120][10240, 1]cuda:0" = res_user_59.view(4096, -1); res_user_59 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:190 in view_3d_to_2d, code: self.ads.view(self.ads.shape[0], emb_dim), | |
| view_25: "bf16[4096, 5120][5120, 1]cuda:0" = res_ads_59.view(4096, -1); res_ads_59 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_56: "bf16[4096, 1024][1024, 1]cuda:0" = torch._C._nn.linear(view_24, l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_, None); view_24 = l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_60: "bf16[4096, 512][1024, 1]cuda:0" = linear_56[(slice(None, None, None), slice(None, 512, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_57: "bf16[4096, 512][512, 1]cuda:0" = torch._C._nn.linear(view_25, l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_, None); view_25 = l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_88: "bf16[4096, 512][1024, 1]cuda:0" = linear_56[(slice(None, None, None), slice(512, None, None))]; linear_56 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_43: "bf16[4096, 512][512, 1]cuda:0" = torch.index_select(getitem_88, dim = 0, index = dummy_ads_to_user_map); getitem_88 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_60: "bf16[4096, 512][512, 1]cuda:0" = linear_57 + index_select_43; linear_57 = index_select_43 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_42: "f32[4096, 512][512, 1]cuda:0" = torch.rms_norm(res_user_60, (512,), l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_60 = l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_43: "f32[4096, 512][512, 1]cuda:0" = torch.rms_norm(res_ads_60, (512,), l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_60 = l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_42: "f32[4096, 512][512, 1]cuda:0" = torch.sigmoid(output_42) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_42: "f32[4096, 512][512, 1]cuda:0" = output_42 * sigmoid_42; output_42 = sigmoid_42 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_43: "f32[4096, 512][512, 1]cuda:0" = torch.sigmoid(output_43) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_43: "f32[4096, 512][512, 1]cuda:0" = output_43 * sigmoid_43; output_43 = sigmoid_43 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_58: "bf16[4096, 1024][1024, 1]cuda:0" = torch._C._nn.linear(mul_42, l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_, None); mul_42 = l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_61: "bf16[4096, 512][1024, 1]cuda:0" = linear_58[(slice(None, None, None), slice(None, 512, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_59: "bf16[4096, 512][512, 1]cuda:0" = torch._C._nn.linear(mul_43, l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_, None); mul_43 = l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_90: "bf16[4096, 512][1024, 1]cuda:0" = linear_58[(slice(None, None, None), slice(512, None, None))]; linear_58 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_44: "bf16[4096, 512][512, 1]cuda:0" = torch.index_select(getitem_90, dim = 0, index = dummy_ads_to_user_map); getitem_90 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_61: "bf16[4096, 512][512, 1]cuda:0" = linear_59 + index_select_44; linear_59 = index_select_44 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_44: "f32[4096, 512][512, 1]cuda:0" = torch.rms_norm(res_user_61, (512,), l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_61 = l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_45: "f32[4096, 512][512, 1]cuda:0" = torch.rms_norm(res_ads_61, (512,), l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_61 = l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_44: "f32[4096, 512][512, 1]cuda:0" = torch.sigmoid(output_44) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_44: "f32[4096, 512][512, 1]cuda:0" = output_44 * sigmoid_44; output_44 = sigmoid_44 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_45: "f32[4096, 512][512, 1]cuda:0" = torch.sigmoid(output_45) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_45: "f32[4096, 512][512, 1]cuda:0" = output_45 * sigmoid_45; output_45 = sigmoid_45 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_60: "bf16[4096, 25600][25600, 1]cuda:0" = torch._C._nn.linear(mul_44, l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_, None); mul_44 = l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_62: "bf16[4096, 12800][25600, 1]cuda:0" = linear_60[(slice(None, None, None), slice(None, 12800, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_61: "bf16[4096, 12800][12800, 1]cuda:0" = torch._C._nn.linear(mul_45, l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_, None); mul_45 = l_self_modules_wukong_modules_layers_modules_3_modules_bitattn_modules_v_proj_modules_output_fc_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_92: "bf16[4096, 12800][25600, 1]cuda:0" = linear_60[(slice(None, None, None), slice(12800, None, None))]; linear_60 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_45: "bf16[4096, 12800][12800, 1]cuda:0" = torch.index_select(getitem_92, dim = 0, index = dummy_ads_to_user_map); getitem_92 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_62: "bf16[4096, 12800][12800, 1]cuda:0" = linear_61 + index_select_45; linear_61 = index_select_45 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:181 in view_2d_to_3d, code: self.user.view(self.user.shape[0], num_emb, emb_dim), | |
| view_26: "bf16[4096, 40, 320][25600, 320, 1]cuda:0" = res_user_62.view(4096, -1, 320); res_user_62 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:182 in view_2d_to_3d, code: self.ads.view(self.ads.shape[0], num_emb, emb_dim), | |
| view_27: "bf16[4096, 40, 320][12800, 320, 1]cuda:0" = res_ads_62.view(4096, -1, 320); res_ads_62 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_46: "f32[4096, 160, 128][20480, 128, 1]cuda:0" = torch.index_select(res_user_58, dim = 0, index = dummy_ads_to_user_map) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:980 in forward, code: x_all = torch.cat( | |
| cat_17: "f32[4096, 320, 128][40960, 128, 1]cuda:0" = torch.cat([index_select_46, res_ads_58], dim = 1); index_select_46 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:989 in <lambda>, code: lambda: v_ads @ x_all | |
| matmul_38: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = view_27 @ cat_17; view_27 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:993 in <lambda>, code: lambda: v_user[:, :, :num_emb_user] @ x.user | |
| getitem_93: "bf16[4096, 40, 160][25600, 320, 1]cuda:0" = view_26[(slice(None, None, None), slice(None, None, None), slice(None, 160, None))]; view_26 = None | |
| matmul_39: "bf16[4096, 40, 128][5120, 128, 1]cuda:0" = getitem_93 @ res_user_58; getitem_93 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:1006 in <lambda>, code: lambda: vx_ads @ x_all.permute(0, 2, 1) | |
| permute_6: "f32[4096, 128, 320][40960, 1, 128]cuda:0" = cat_17.permute(0, 2, 1); cat_17 = None | |
| res_ads_63: "bf16[4096, 40, 320][12800, 320, 1]cuda:0" = matmul_38 @ permute_6; matmul_38 = permute_6 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:1010 in <lambda>, code: lambda: vx_user @ x.user.permute(0, 2, 1) | |
| permute_7: "f32[4096, 128, 160][20480, 1, 128]cuda:0" = res_user_58.permute(0, 2, 1) | |
| res_user_63: "bf16[4096, 40, 160][6400, 160, 1]cuda:0" = matmul_39 @ permute_7; matmul_39 = permute_7 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:189 in view_3d_to_2d, code: self.user.view(self.user.shape[0], emb_dim), | |
| view_28: "bf16[4096, 6400][6400, 1]cuda:0" = res_user_63.view(4096, -1); res_user_63 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:190 in view_3d_to_2d, code: self.ads.view(self.ads.shape[0], emb_dim), | |
| view_29: "bf16[4096, 12800][12800, 1]cuda:0" = res_ads_63.view(4096, -1); res_ads_63 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:208 in cat, code: torch.cat([t.user for t in tensors], dim=1), | |
| cat_18: "f32[4096, 10496][10496, 1]cuda:0" = torch.cat([view_28, res_user_1], dim = 1); view_28 = res_user_1 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:209 in cat, code: torch.cat([t.ads for t in tensors], dim=1), | |
| cat_19: "f32[4096, 16896][16896, 1]cuda:0" = torch.cat([view_29, res_ads_1], dim = 1); view_29 = res_ads_1 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| res_user_64: "f32[4096, 10496][10496, 1]cuda:0" = torch.rms_norm(cat_18, (10496,), l_self_modules_wukong_modules_layers_modules_3_modules_ln1_modules_ln_user_parameters_weight_, None); cat_18 = l_self_modules_wukong_modules_layers_modules_3_modules_ln1_modules_ln_user_parameters_weight_ = None | |
| res_ads_64: "f32[4096, 16896][16896, 1]cuda:0" = torch.rms_norm(cat_19, (16896,), l_self_modules_wukong_modules_layers_modules_3_modules_ln1_modules_ln_ads_parameters_weight_, None); cat_19 = l_self_modules_wukong_modules_layers_modules_3_modules_ln1_modules_ln_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_62: "bf16[4096, 7680][7680, 1]cuda:0" = torch._C._nn.linear(res_user_64, l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_, None); res_user_64 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_0_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_65: "bf16[4096, 3840][7680, 1]cuda:0" = linear_62[(slice(None, None, None), slice(None, 3840, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_63: "bf16[4096, 3840][3840, 1]cuda:0" = torch._C._nn.linear(res_ads_64, l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_, None); res_ads_64 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_0_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_95: "bf16[4096, 3840][7680, 1]cuda:0" = linear_62[(slice(None, None, None), slice(3840, None, None))]; linear_62 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_47: "bf16[4096, 3840][3840, 1]cuda:0" = torch.index_select(getitem_95, dim = 0, index = dummy_ads_to_user_map); getitem_95 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_65: "bf16[4096, 3840][3840, 1]cuda:0" = linear_63 + index_select_47; linear_63 = index_select_47 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_46: "f32[4096, 3840][3840, 1]cuda:0" = torch.rms_norm(res_user_65, (3840,), l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_65 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_47: "f32[4096, 3840][3840, 1]cuda:0" = torch.rms_norm(res_ads_65, (3840,), l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_65 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_0_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_46: "f32[4096, 3840][3840, 1]cuda:0" = torch.sigmoid(output_46) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_46: "f32[4096, 3840][3840, 1]cuda:0" = output_46 * sigmoid_46; output_46 = sigmoid_46 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_47: "f32[4096, 3840][3840, 1]cuda:0" = torch.sigmoid(output_47) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_47: "f32[4096, 3840][3840, 1]cuda:0" = output_47 * sigmoid_47; output_47 = sigmoid_47 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_64: "bf16[4096, 3840][3840, 1]cuda:0" = torch._C._nn.linear(mul_46, l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_1_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_66: "bf16[4096, 1920][3840, 1]cuda:0" = linear_64[(slice(None, None, None), slice(None, 1920, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_65: "bf16[4096, 1920][1920, 1]cuda:0" = torch._C._nn.linear(mul_47, l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_1_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_97: "bf16[4096, 1920][3840, 1]cuda:0" = linear_64[(slice(None, None, None), slice(1920, None, None))]; linear_64 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_48: "bf16[4096, 1920][1920, 1]cuda:0" = torch.index_select(getitem_97, dim = 0, index = dummy_ads_to_user_map); getitem_97 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_66: "bf16[4096, 1920][1920, 1]cuda:0" = linear_65 + index_select_48; linear_65 = index_select_48 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_48: "f32[4096, 1920][1920, 1]cuda:0" = torch.rms_norm(res_user_66, (1920,), l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_66 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_49: "f32[4096, 1920][1920, 1]cuda:0" = torch.rms_norm(res_ads_66, (1920,), l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_66 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_1_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_48: "f32[4096, 1920][1920, 1]cuda:0" = torch.sigmoid(output_48) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_48: "f32[4096, 1920][1920, 1]cuda:0" = output_48 * sigmoid_48; output_48 = sigmoid_48 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_49: "f32[4096, 1920][1920, 1]cuda:0" = torch.sigmoid(output_49) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_49: "f32[4096, 1920][1920, 1]cuda:0" = output_49 * sigmoid_49; output_49 = sigmoid_49 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_66: "bf16[4096, 7680][7680, 1]cuda:0" = torch._C._nn.linear(mul_48, l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_, None); mul_48 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_2_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_67: "bf16[4096, 3840][7680, 1]cuda:0" = linear_66[(slice(None, None, None), slice(None, 3840, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_67: "bf16[4096, 3840][3840, 1]cuda:0" = torch._C._nn.linear(mul_49, l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_, None); mul_49 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_2_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_99: "bf16[4096, 3840][7680, 1]cuda:0" = linear_66[(slice(None, None, None), slice(3840, None, None))]; linear_66 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_49: "bf16[4096, 3840][3840, 1]cuda:0" = torch.index_select(getitem_99, dim = 0, index = dummy_ads_to_user_map); getitem_99 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_67: "bf16[4096, 3840][3840, 1]cuda:0" = linear_67 + index_select_49; linear_67 = index_select_49 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:131 in __add__, code: res_user = self.user + other.user | |
| res_user_68: "f32[4096, 3840][3840, 1]cuda:0" = mul_46 + res_user_67; mul_46 = res_user_67 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:132 in __add__, code: res_ads = self.ads + other.ads | |
| res_ads_68: "f32[4096, 3840][3840, 1]cuda:0" = mul_47 + res_ads_67; mul_47 = res_ads_67 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_50: "f32[4096, 3840][3840, 1]cuda:0" = torch.rms_norm(res_user_68, (3840,), l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_68 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_51: "f32[4096, 3840][3840, 1]cuda:0" = torch.rms_norm(res_ads_68, (3840,), l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_68 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_2_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_50: "f32[4096, 3840][3840, 1]cuda:0" = torch.sigmoid(output_50) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_50: "f32[4096, 3840][3840, 1]cuda:0" = output_50 * sigmoid_50; output_50 = sigmoid_50 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_51: "f32[4096, 3840][3840, 1]cuda:0" = torch.sigmoid(output_51) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_51: "f32[4096, 3840][3840, 1]cuda:0" = output_51 * sigmoid_51; output_51 = sigmoid_51 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_68: "bf16[4096, 2560][2560, 1]cuda:0" = torch._C._nn.linear(mul_50, l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_3_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_69: "bf16[4096, 1280][2560, 1]cuda:0" = linear_68[(slice(None, None, None), slice(None, 1280, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_69: "bf16[4096, 1280][1280, 1]cuda:0" = torch._C._nn.linear(mul_51, l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_, None); l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_3_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_101: "bf16[4096, 1280][2560, 1]cuda:0" = linear_68[(slice(None, None, None), slice(1280, None, None))]; linear_68 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_50: "bf16[4096, 1280][1280, 1]cuda:0" = torch.index_select(getitem_101, dim = 0, index = dummy_ads_to_user_map); getitem_101 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_69: "bf16[4096, 1280][1280, 1]cuda:0" = linear_69 + index_select_50; linear_69 = index_select_50 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_52: "f32[4096, 1280][1280, 1]cuda:0" = torch.rms_norm(res_user_69, (1280,), l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_69 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_53: "f32[4096, 1280][1280, 1]cuda:0" = torch.rms_norm(res_ads_69, (1280,), l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_69 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_3_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_52: "f32[4096, 1280][1280, 1]cuda:0" = torch.sigmoid(output_52) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_52: "f32[4096, 1280][1280, 1]cuda:0" = output_52 * sigmoid_52; output_52 = sigmoid_52 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_53: "f32[4096, 1280][1280, 1]cuda:0" = torch.sigmoid(output_53) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| mul_53: "f32[4096, 1280][1280, 1]cuda:0" = output_53 * sigmoid_53; output_53 = sigmoid_53 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:515 in <lambda>, code: res_shared = self.user_compute(lambda: self.linear_shared(x_user)) | |
| linear_70: "bf16[4096, 7680][7680, 1]cuda:0" = torch._C._nn.linear(mul_52, l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_, None); mul_52 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_4_modules_linear_shared_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:516 in forward_linear, code: res_user = res_shared[:, : self.num_out_user] | |
| res_user_70: "bf16[4096, 3840][7680, 1]cuda:0" = linear_70[(slice(None, None, None), slice(None, 3840, None))] | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| linear_71: "bf16[4096, 3840][3840, 1]cuda:0" = torch._C._nn.linear(mul_53, l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_, None); mul_53 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_fcs_modules_4_modules_linear_ads_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:521 in <lambda>, code: self.broadcast(res_shared[:, self.num_out_user :], ads_to_user_map), | |
| getitem_103: "bf16[4096, 3840][7680, 1]cuda:0" = linear_70[(slice(None, None, None), slice(3840, None, None))]; linear_70 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/common.py:64 in forward, code: return torch.index_select(x, dim=0, index=batch_indices) | |
| index_select_51: "bf16[4096, 3840][3840, 1]cuda:0" = torch.index_select(getitem_103, dim = 0, index = dummy_ads_to_user_map); getitem_103 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:519 in <lambda>, code: lambda: self.linear_ads(x_ads) | |
| res_ads_70: "bf16[4096, 3840][3840, 1]cuda:0" = linear_71 + index_select_51; linear_71 = index_select_51 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:131 in __add__, code: res_user = self.user + other.user | |
| res_user_71: "f32[4096, 3840][3840, 1]cuda:0" = mul_50 + res_user_70; mul_50 = res_user_70 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:132 in __add__, code: res_ads = self.ads + other.ads | |
| res_ads_71: "f32[4096, 3840][3840, 1]cuda:0" = mul_51 + res_ads_70; mul_51 = res_ads_70 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torch/nn/functional.py:2958 in rms_norm, code: return torch.rms_norm(input, normalized_shape, weight, eps) | |
| output_54: "f32[4096, 3840][3840, 1]cuda:0" = torch.rms_norm(res_user_71, (3840,), l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_, None); res_user_71 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_user_modules_ln_parameters_weight_ = None | |
| output_55: "f32[4096, 3840][3840, 1]cuda:0" = torch.rms_norm(res_ads_71, (3840,), l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_, None); res_ads_71 = l_self_modules_wukong_modules_layers_modules_3_modules_bitmlp_modules_acts_modules_4_modules_activation_modules_ln_ads_modules_ln_parameters_weight_ = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_54: "f32[4096, 3840][3840, 1]cuda:0" = torch.sigmoid(output_54) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:249 in forward, code: x_user * self.act(x_user), | |
| mul_54: "f32[4096, 3840][3840, 1]cuda:0" = output_54 * sigmoid_54; output_54 = sigmoid_54 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:321 in forward, code: return self.activation(x) | |
| sigmoid_55: "f32[4096, 3840][3840, 1]cuda:0" = torch.sigmoid(output_55) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/torchrec/fb/wukong/prod/ads/wukong_rocs.py:250 in forward, code: x_ads * self.act(x_ads), | |
| wukong_output: "f32[4096, 3840][3840, 1]cuda:0" = output_55 * sigmoid_55; output_55 = sigmoid_55 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:669 in forward, code: wukong_output = wukong_output.flatten(start_dim=1) | |
| wukong_output_1: "f32[4096, 3840][3840, 1]cuda:0" = wukong_output.flatten(start_dim = 1) | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:672 in forward, code: "user": v.user.flatten(start_dim=1).size(1), | |
| flatten_1: "f32[4096, 4608][4608, 1]cuda:0" = mul_12.flatten(start_dim = 1); flatten_1 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:673 in forward, code: "ad": v.ads.flatten(start_dim=1).size(1), | |
| flatten_2: "f32[4096, 4608][4608, 1]cuda:0" = mul_13.flatten(start_dim = 1); flatten_2 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:672 in forward, code: "user": v.user.flatten(start_dim=1).size(1), | |
| flatten_3: "f32[4096, 24576][24576, 1]cuda:0" = res_user_20.flatten(start_dim = 1); flatten_3 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:673 in forward, code: "ad": v.ads.flatten(start_dim=1).size(1), | |
| flatten_4: "f32[4096, 24576][24576, 1]cuda:0" = res_ads_20.flatten(start_dim = 1); flatten_4 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:672 in forward, code: "user": v.user.flatten(start_dim=1).size(1), | |
| flatten_5: "f32[4096, 20480][20480, 1]cuda:0" = res_user_39.flatten(start_dim = 1); flatten_5 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:673 in forward, code: "ad": v.ads.flatten(start_dim=1).size(1), | |
| flatten_6: "f32[4096, 20480][20480, 1]cuda:0" = res_ads_39.flatten(start_dim = 1); flatten_6 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:672 in forward, code: "user": v.user.flatten(start_dim=1).size(1), | |
| flatten_7: "f32[4096, 20480][20480, 1]cuda:0" = res_user_58.flatten(start_dim = 1); flatten_7 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:673 in forward, code: "ad": v.ads.flatten(start_dim=1).size(1), | |
| flatten_8: "f32[4096, 20480][20480, 1]cuda:0" = res_ads_58.flatten(start_dim = 1); flatten_8 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:672 in forward, code: "user": v.user.flatten(start_dim=1).size(1), | |
| flatten_9: "f32[4096, 3840][3840, 1]cuda:0" = mul_54.flatten(start_dim = 1); flatten_9 = None | |
| # File: /data/users/shunting/fbsource/buck-out/v2/art/fbcode/ccb28570d24d5484/ads_training/p9e/component_benchmarking/v2/__cb_cli_aps__/cb_cli_aps#link-tree/aps_models/ads/gmp/models/mtml_ctr_instagram_model/managed/Y2026Q1/mtml_ctr_instagram_model_827295674_v0_fork.py:673 in forward, code: "ad": v.ads.flatten(start_dim=1).size(1), | |
| flatten_10: "f32[4096, 3840][3840, 1]cuda:0" = wukong_output.flatten(start_dim = 1); flatten_10 = None | |
| return (mul_12, mul_13, dummy_ads_to_user_map, res_user_20, res_ads_20, res_user_39, res_ads_39, res_user_58, res_ads_58, mul_54, wukong_output, wukong_output_1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment