2021-09-14 16:33:50.905443: W tensorflow/compiler/xla/service/gpu/gpu_executable.cc:303] PROFILING: profiling is enabled
2021-09-14 16:33:50.919607: I tensorflow/compiler/xla/service/executable.cc:221] Execution profile for cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_7__XlaNumResourceArgs_0_.2111: (12.3 ms @ f_nom)
2021-09-14 16:33:50.919633: I tensorflow/compiler/xla/service/executable.cc:221] 17274099 cycles (100.% 100Σ) :: 12251.1 usec ( 692.4 optimal) :: 61.77GFLOP/s :: 4.62GTROP/s :: 81.86GiB/s :: 62B/cycle :: [total] [entry]
2021-09-14 16:33:50.919644: I tensorflow/compiler/xla/service/executable.cc:221] 151601 cycles ( 0.88% 1Σ) :: 107.5 usec :: :: :: :: :: %custom-call.1 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.180, f16[768,768]{1,0} %constant_23, f16[1024,768]{1,0} %broadcast.26), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_0/attention/self/query/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919656: I tensorflow/compiler/xla/service/executable.cc:221] 90960 cycles ( 0.53% 1Σ) :: 64.5 usec :: :: :: :: :: %custom-call.32 = f16[1024,768]{1,0} custom-call(f16[1024,3072]{1,0} %fusion.139, f16[3072,768]{1,0} %constant_478), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_2/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.919662: I tensorflow/compiler/xla/service/executable.cc:221] 90960 cycles ( 0.53% 2Σ) :: 64.5 usec :: :: :: :: :: %custom-call.10 = f16[1024,768]{1,0} custom-call(f16[1024,3072]{1,0} %fusion.169, f16[3072,768]{1,0} %constant_132), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_0/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.919667: I tensorflow/compiler/xla/service/executable.cc:221] 89516 cycles ( 0.52% 2Σ) :: 63.5 usec :: :: :: :: :: %custom-call.131 = f16[1024,768]{1,0} custom-call(f16[1024,3072]{1,0} %fusion.4, f16[3072,768]{1,0} %constant_2035), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_11/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.919673: I tensorflow/compiler/xla/service/executable.cc:221] 89516 cycles ( 0.52% 3Σ) :: 63.5 usec :: :: :: :: :: %custom-call.43 = f16[1024,768]{1,0} custom-call(f16[1024,3072]{1,0} %fusion.124, f16[3072,768]{1,0} %constant_651), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_3/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.919678: I tensorflow/compiler/xla/service/executable.cc:221] 89516 cycles ( 0.52% 3Σ) :: 63.5 usec :: :: :: :: :: %custom-call.120 = f16[1024,768]{1,0} custom-call(f16[1024,3072]{1,0} %fusion.19, f16[3072,768]{1,0} %constant_1862), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_10/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.919690: I tensorflow/compiler/xla/service/executable.cc:221] 89516 cycles ( 0.52% 4Σ) :: 63.5 usec :: :: :: :: :: %custom-call.109 = f16[1024,768]{1,0} custom-call(f16[1024,3072]{1,0} %fusion.34, f16[3072,768]{1,0} %constant_1689), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_9/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.919696: I tensorflow/compiler/xla/service/executable.cc:221] 89516 cycles ( 0.52% 5Σ) :: 63.5 usec :: :: :: :: :: %custom-call.21 = f16[1024,768]{1,0} custom-call(f16[1024,3072]{1,0} %fusion.154, f16[3072,768]{1,0} %constant_305), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_1/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.919701: I tensorflow/compiler/xla/service/executable.cc:221] 89516 cycles ( 0.52% 5Σ) :: 63.5 usec :: :: :: :: :: %custom-call.87 = f16[1024,768]{1,0} custom-call(f16[1024,3072]{1,0} %fusion.64, f16[3072,768]{1,0} %constant_1343), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_7/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.919706: I tensorflow/compiler/xla/service/executable.cc:221] 89516 cycles ( 0.52% 6Σ) :: 63.5 usec :: :: :: :: :: %custom-call.54 = f16[1024,768]{1,0} custom-call(f16[1024,3072]{1,0} %fusion.109, f16[3072,768]{1,0} %constant_824), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_4/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.919713: I tensorflow/compiler/xla/service/executable.cc:221] 88072 cycles ( 0.51% 6Σ) :: 62.5 usec :: :: :: :: :: %custom-call.65 = f16[1024,768]{1,0} custom-call(f16[1024,3072]{1,0} %fusion.94, f16[3072,768]{1,0} %constant_997), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_5/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.919718: I tensorflow/compiler/xla/service/executable.cc:221] 88072 cycles ( 0.51% 7Σ) :: 62.5 usec :: :: :: :: :: %custom-call.98 = f16[1024,768]{1,0} custom-call(f16[1024,3072]{1,0} %fusion.49, f16[3072,768]{1,0} %constant_1516), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_8/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.919727: I tensorflow/compiler/xla/service/executable.cc:221] 88072 cycles ( 0.51% 7Σ) :: 62.5 usec :: :: :: :: :: %custom-call.76 = f16[1024,768]{1,0} custom-call(f16[1024,3072]{1,0} %fusion.79, f16[3072,768]{1,0} %constant_1170), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_6/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.919735: I tensorflow/compiler/xla/service/executable.cc:221] 77967 cycles ( 0.45% 8Σ) :: 55.3 usec :: :: :: :: :: %custom-call.9 = f16[1024,3072]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.134, f16[768,3072]{1,0} %constant_107), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_0/intermediate/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"0\"}"
2021-09-14 16:33:50.919741: I tensorflow/compiler/xla/service/executable.cc:221] 67860 cycles ( 0.39% 8Σ) :: 48.1 usec :: :: :: :: :: %custom-call.20 = f16[1024,3072]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.122, f16[768,3072]{1,0} %constant_280), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_1/intermediate/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"0\"}"
2021-09-14 16:33:50.919746: I tensorflow/compiler/xla/service/executable.cc:221] 66416 cycles ( 0.38% 8Σ) :: 47.1 usec :: :: :: :: :: %custom-call.31 = f16[1024,3072]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.110, f16[768,3072]{1,0} %constant_453), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_2/intermediate/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"0\"}"
2021-09-14 16:33:50.919752: I tensorflow/compiler/xla/service/executable.cc:221] 66416 cycles ( 0.38% 9Σ) :: 47.1 usec :: :: :: :: :: %custom-call.119 = f16[1024,3072]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.14, f16[768,3072]{1,0} %constant_1837), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_10/intermediate/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"0\"}"
2021-09-14 16:33:50.919760: I tensorflow/compiler/xla/service/executable.cc:221] 66416 cycles ( 0.38% 9Σ) :: 47.1 usec :: :: :: :: :: %custom-call.64 = f16[1024,3072]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.74, f16[768,3072]{1,0} %constant_972), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_5/intermediate/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"0\"}"
2021-09-14 16:33:50.919765: I tensorflow/compiler/xla/service/executable.cc:221] 66416 cycles ( 0.38% 9Σ) :: 47.1 usec :: :: :: :: :: %custom-call.75 = f16[1024,3072]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.62, f16[768,3072]{1,0} %constant_1145), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_6/intermediate/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"0\"}"
2021-09-14 16:33:50.919771: I tensorflow/compiler/xla/service/executable.cc:221] 66416 cycles ( 0.38% 10Σ) :: 47.1 usec :: :: :: :: :: %custom-call.130 = f16[1024,3072]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.2, f16[768,3072]{1,0} %constant_2010), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_11/intermediate/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"0\"}"
2021-09-14 16:33:50.919776: I tensorflow/compiler/xla/service/executable.cc:221] 66416 cycles ( 0.38% 10Σ) :: 47.1 usec :: :: :: :: :: %custom-call.86 = f16[1024,3072]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.50, f16[768,3072]{1,0} %constant_1318), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_7/intermediate/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"0\"}"
2021-09-14 16:33:50.919782: I tensorflow/compiler/xla/service/executable.cc:221] 66416 cycles ( 0.38% 11Σ) :: 47.1 usec :: :: :: :: :: %custom-call.42 = f16[1024,3072]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.98, f16[768,3072]{1,0} %constant_626), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_3/intermediate/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"0\"}"
2021-09-14 16:33:50.919787: I tensorflow/compiler/xla/service/executable.cc:221] 66416 cycles ( 0.38% 11Σ) :: 47.1 usec :: :: :: :: :: %custom-call.53 = f16[1024,3072]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.86, f16[768,3072]{1,0} %constant_799), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_4/intermediate/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"0\"}"
2021-09-14 16:33:50.919796: I tensorflow/compiler/xla/service/executable.cc:221] 64972 cycles ( 0.38% 11Σ) :: 46.1 usec :: :: :: :: :: %custom-call.4 = f16[8,12,128,128]{3,2,1,0} custom-call(f16[8,12,128,64]{3,2,1,0} %fusion.179, f16[8,12,128,64]{3,2,1,0} %fusion.178), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_0/attention/self/MatMul"}, backend_config="{\"alpha_real\":0.125,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"3\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.919801: I tensorflow/compiler/xla/service/executable.cc:221] 64972 cycles ( 0.38% 12Σ) :: 46.1 usec :: :: :: :: :: %custom-call.108 = f16[1024,3072]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.26, f16[768,3072]{1,0} %constant_1664), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_9/intermediate/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"0\"}"
2021-09-14 16:33:50.919806: I tensorflow/compiler/xla/service/executable.cc:221] 64972 cycles ( 0.38% 12Σ) :: 46.1 usec :: :: :: :: :: %custom-call.97 = f16[1024,3072]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.38, f16[768,3072]{1,0} %constant_1491), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_8/intermediate/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"0\"}"
2021-09-14 16:33:50.919811: I tensorflow/compiler/xla/service/executable.cc:221] 57752 cycles ( 0.33% 12Σ) :: 41.0 usec :: :: :: :: :: %custom-call.3 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.180, f16[768,768]{1,0} %constant_16, f16[1024,768]{1,0} %broadcast.19), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_0/attention/self/key/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919817: I tensorflow/compiler/xla/service/executable.cc:221] 56308 cycles ( 0.33% 13Σ) :: 39.9 usec :: :: :: :: :: %custom-call.6 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.180, f16[768,768]{1,0} %constant_39, f16[1024,768]{1,0} %broadcast.42), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_0/attention/self/value/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919825: I tensorflow/compiler/xla/service/executable.cc:221] 53422 cycles ( 0.31% 13Σ) :: 37.9 usec :: :: :: :: :: %custom-call.17 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.130, f16[768,768]{1,0} %constant_212, f16[1024,768]{1,0} %broadcast.215), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_1/attention/self/value/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919830: I tensorflow/compiler/xla/service/executable.cc:221] 53422 cycles ( 0.31% 13Σ) :: 37.9 usec ( 6.4 optimal) :: 93.08GFLOP/s :: :: 245.47GiB/s :: 186B/cycle :: %fusion.252 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %custom-call.4, f32[8,128,128]{2,1,0} %arg0.1), kind=kInput, calls=%fused_computation.252, metadata={op_type="Softmax" op_name="bert/encoder/layer_0/attention/self/Softmax"}
2021-09-14 16:33:50.919836: I tensorflow/compiler/xla/service/executable.cc:221] 51978 cycles ( 0.30% 14Σ) :: 36.9 usec :: :: :: :: :: %custom-call.127 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.10, f16[768,768]{1,0} %constant_1942, f16[1024,768]{1,0} %broadcast.1945), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_11/attention/self/value/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919841: I tensorflow/compiler/xla/service/executable.cc:221] 51978 cycles ( 0.30% 14Σ) :: 36.9 usec :: :: :: :: :: %custom-call.50 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.94, f16[768,768]{1,0} %constant_731, f16[1024,768]{1,0} %broadcast.734), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_4/attention/self/value/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919847: I tensorflow/compiler/xla/service/executable.cc:221] 51978 cycles ( 0.30% 14Σ) :: 36.9 usec :: :: :: :: :: %custom-call.28 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.118, f16[768,768]{1,0} %constant_385, f16[1024,768]{1,0} %broadcast.388), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_2/attention/self/value/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919857: I tensorflow/compiler/xla/service/executable.cc:221] 51978 cycles ( 0.30% 15Σ) :: 36.9 usec :: :: :: :: :: %custom-call.105 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.34, f16[768,768]{1,0} %constant_1596, f16[1024,768]{1,0} %broadcast.1599), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_9/attention/self/value/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919862: I tensorflow/compiler/xla/service/executable.cc:221] 51978 cycles ( 0.30% 15Σ) :: 36.9 usec :: :: :: :: :: %custom-call.78 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.58, f16[768,768]{1,0} %constant_1234, f16[1024,768]{1,0} %broadcast.1237), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_7/attention/self/query/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919867: I tensorflow/compiler/xla/service/executable.cc:221] 51978 cycles ( 0.30% 15Σ) :: 36.9 usec :: :: :: :: :: %custom-call.72 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.70, f16[768,768]{1,0} %constant_1077, f16[1024,768]{1,0} %broadcast.1080), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_6/attention/self/value/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919873: I tensorflow/compiler/xla/service/executable.cc:221] 51978 cycles ( 0.30% 16Σ) :: 36.9 usec :: :: :: :: :: %custom-call.34 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.106, f16[768,768]{1,0} %constant_542, f16[1024,768]{1,0} %broadcast.545), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_3/attention/self/query/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919878: I tensorflow/compiler/xla/service/executable.cc:221] 51978 cycles ( 0.30% 16Σ) :: 36.9 usec :: :: :: :: :: %custom-call.8 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.174, f16[768,768]{1,0} %constant_50), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_0/attention/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.919886: I tensorflow/compiler/xla/service/executable.cc:221] 51978 cycles ( 0.30% 16Σ) :: 36.9 usec :: :: :: :: :: %custom-call.94 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.46, f16[768,768]{1,0} %constant_1423, f16[1024,768]{1,0} %broadcast.1426), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_8/attention/self/value/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919891: I tensorflow/compiler/xla/service/executable.cc:221] 51978 cycles ( 0.30% 16Σ) :: 36.9 usec :: :: :: :: :: %custom-call.39 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.106, f16[768,768]{1,0} %constant_558, f16[1024,768]{1,0} %broadcast.561), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_3/attention/self/value/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919897: I tensorflow/compiler/xla/service/executable.cc:221] 51978 cycles ( 0.30% 17Σ) :: 36.9 usec :: :: :: :: :: %custom-call.12 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.130, f16[768,768]{1,0} %constant_196, f16[1024,768]{1,0} %broadcast.199), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_1/attention/self/query/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919902: I tensorflow/compiler/xla/service/executable.cc:221] 51978 cycles ( 0.30% 17Σ) :: 36.9 usec :: :: :: :: :: %custom-call.45 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.94, f16[768,768]{1,0} %constant_715, f16[1024,768]{1,0} %broadcast.718), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_4/attention/self/query/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919908: I tensorflow/compiler/xla/service/executable.cc:221] 50534 cycles ( 0.29% 17Σ) :: 35.8 usec :: :: :: :: :: %custom-call.36 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.106, f16[768,768]{1,0} %constant_535, f16[1024,768]{1,0} %broadcast.538), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_3/attention/self/key/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919920: I tensorflow/compiler/xla/service/executable.cc:221] 50534 cycles ( 0.29% 18Σ) :: 35.8 usec :: :: :: :: :: %custom-call.116 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.22, f16[768,768]{1,0} %constant_1769, f16[1024,768]{1,0} %broadcast.1772), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_10/attention/self/value/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919931: I tensorflow/compiler/xla/service/executable.cc:221] 50534 cycles ( 0.29% 18Σ) :: 35.8 usec :: :: :: :: :: %custom-call.67 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.70, f16[768,768]{1,0} %constant_1061, f16[1024,768]{1,0} %broadcast.1064), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_6/attention/self/query/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919936: I tensorflow/compiler/xla/service/executable.cc:221] 50534 cycles ( 0.29% 18Σ) :: 35.8 usec :: :: :: :: :: %custom-call.122 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.10, f16[768,768]{1,0} %constant_1926, f16[1024,768]{1,0} %broadcast.1929), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_11/attention/self/query/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919942: I tensorflow/compiler/xla/service/executable.cc:221] 50534 cycles ( 0.29% 18Σ) :: 35.8 usec :: :: :: :: :: %custom-call.61 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.82, f16[768,768]{1,0} %constant_904, f16[1024,768]{1,0} %broadcast.907), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_5/attention/self/value/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919947: I tensorflow/compiler/xla/service/executable.cc:221] 50534 cycles ( 0.29% 19Σ) :: 35.8 usec :: :: :: :: :: %custom-call.80 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.58, f16[768,768]{1,0} %constant_1227, f16[1024,768]{1,0} %broadcast.1230), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_7/attention/self/key/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919956: I tensorflow/compiler/xla/service/executable.cc:221] 50534 cycles ( 0.29% 19Σ) :: 35.8 usec :: :: :: :: :: %custom-call.100 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.34, f16[768,768]{1,0} %constant_1580, f16[1024,768]{1,0} %broadcast.1583), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_9/attention/self/query/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919961: I tensorflow/compiler/xla/service/executable.cc:221] 50534 cycles ( 0.29% 19Σ) :: 35.8 usec :: :: :: :: :: %custom-call.69 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.70, f16[768,768]{1,0} %constant_1054, f16[1024,768]{1,0} %broadcast.1057), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_6/attention/self/key/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919966: I tensorflow/compiler/xla/service/executable.cc:221] 50534 cycles ( 0.29% 20Σ) :: 35.8 usec :: :: :: :: :: %custom-call.23 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.118, f16[768,768]{1,0} %constant_369, f16[1024,768]{1,0} %broadcast.372), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_2/attention/self/query/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919972: I tensorflow/compiler/xla/service/executable.cc:221] 50534 cycles ( 0.29% 20Σ) :: 35.8 usec :: :: :: :: :: %custom-call.14 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.130, f16[768,768]{1,0} %constant_189, f16[1024,768]{1,0} %broadcast.192), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_1/attention/self/key/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919977: I tensorflow/compiler/xla/service/executable.cc:221] 50534 cycles ( 0.29% 20Σ) :: 35.8 usec :: :: :: :: :: %custom-call.96 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.54, f16[768,768]{1,0} %constant_1434), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_8/attention/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.919985: I tensorflow/compiler/xla/service/executable.cc:221] 50534 cycles ( 0.29% 21Σ) :: 35.8 usec :: :: :: :: :: %custom-call.83 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.58, f16[768,768]{1,0} %constant_1250, f16[1024,768]{1,0} %broadcast.1253), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_7/attention/self/value/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919991: I tensorflow/compiler/xla/service/executable.cc:221] 50534 cycles ( 0.29% 21Σ) :: 35.8 usec :: :: :: :: :: %custom-call.89 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.46, f16[768,768]{1,0} %constant_1407, f16[1024,768]{1,0} %broadcast.1410), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_8/attention/self/query/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.919996: I tensorflow/compiler/xla/service/executable.cc:221] 50534 cycles ( 0.29% 21Σ) :: 35.8 usec :: :: :: :: :: %custom-call.56 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.82, f16[768,768]{1,0} %constant_888, f16[1024,768]{1,0} %broadcast.891), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_5/attention/self/query/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.920001: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 21Σ) :: 34.8 usec :: :: :: :: :: %custom-call.52 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.114, f16[768,768]{1,0} %constant_742), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_4/attention/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.920007: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 22Σ) :: 34.8 usec :: :: :: :: :: %custom-call.58 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.82, f16[768,768]{1,0} %constant_881, f16[1024,768]{1,0} %broadcast.884), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_5/attention/self/key/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.920012: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 22Σ) :: 34.8 usec :: :: :: :: :: %custom-call.74 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.84, f16[768,768]{1,0} %constant_1088), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_6/attention/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.920020: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 22Σ) :: 34.8 usec :: :: :: :: :: %custom-call.129 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.9, f16[768,768]{1,0} %constant_1953), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_11/attention/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.920026: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 23Σ) :: 34.8 usec :: :: :: :: :: %custom-call.85 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.69, f16[768,768]{1,0} %constant_1261), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_7/attention/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.920031: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 23Σ) :: 34.8 usec :: :: :: :: :: %custom-call.124 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.10, f16[768,768]{1,0} %constant_1919, f16[1024,768]{1,0} %broadcast.1922), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_11/attention/self/key/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.920036: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 23Σ) :: 34.8 usec ( 6.1 optimal) :: 90.00GFLOP/s :: :: 253.10GiB/s :: 192B/cycle :: %fusion.222 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.142, f16[8,12,128,128]{3,2,1,0} %custom-call.59), kind=kInput, calls=%fused_computation.222, metadata={op_type="Softmax" op_name="bert/encoder/layer_5/attention/self/Softmax"}
2021-09-14 16:33:50.920042: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 23Σ) :: 34.8 usec :: :: :: :: :: %custom-call.107 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.39, f16[768,768]{1,0} %constant_1607), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_9/attention/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.920050: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 24Σ) :: 34.8 usec ( 6.1 optimal) :: 90.00GFLOP/s :: :: 253.10GiB/s :: 192B/cycle :: %fusion.186 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.142, f16[8,12,128,128]{3,2,1,0} %custom-call.125), kind=kInput, calls=%fused_computation.186, metadata={op_type="Softmax" op_name="bert/encoder/layer_11/attention/self/Softmax"}
2021-09-14 16:33:50.920056: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 24Σ) :: 34.8 usec :: :: :: :: :: %custom-call.47 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.94, f16[768,768]{1,0} %constant_708, f16[1024,768]{1,0} %broadcast.711), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_4/attention/self/key/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.920061: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 24Σ) :: 34.8 usec :: :: :: :: :: %custom-call.118 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.24, f16[768,768]{1,0} %constant_1780), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_10/attention/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.920066: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 25Σ) :: 34.8 usec :: :: :: :: :: %custom-call.113 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.22, f16[768,768]{1,0} %constant_1746, f16[1024,768]{1,0} %broadcast.1749), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_10/attention/self/key/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.920078: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 25Σ) :: 34.8 usec :: :: :: :: :: %custom-call.111 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.22, f16[768,768]{1,0} %constant_1753, f16[1024,768]{1,0} %broadcast.1756), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_10/attention/self/query/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.920083: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 25Σ) :: 34.8 usec :: :: :: :: :: %custom-call.102 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.34, f16[768,768]{1,0} %constant_1573, f16[1024,768]{1,0} %broadcast.1576), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_9/attention/self/key/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.920092: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 25Σ) :: 34.8 usec :: :: :: :: :: %custom-call.19 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.159, f16[768,768]{1,0} %constant_223), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_1/attention/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.920098: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 26Σ) :: 34.8 usec ( 6.1 optimal) :: 90.00GFLOP/s :: :: 253.10GiB/s :: 192B/cycle :: %fusion.246 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.142, f16[8,12,128,128]{3,2,1,0} %custom-call.15), kind=kInput, calls=%fused_computation.246, metadata={op_type="Softmax" op_name="bert/encoder/layer_1/attention/self/Softmax"}
2021-09-14 16:33:50.920104: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 26Σ) :: 34.8 usec :: :: :: :: :: %custom-call.41 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.129, f16[768,768]{1,0} %constant_569), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_3/attention/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.920109: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 26Σ) :: 34.8 usec ( 6.1 optimal) :: 90.00GFLOP/s :: :: 253.10GiB/s :: 192B/cycle :: %fusion.192 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.142, f16[8,12,128,128]{3,2,1,0} %custom-call.114), kind=kInput, calls=%fused_computation.192, metadata={op_type="Softmax" op_name="bert/encoder/layer_10/attention/self/Softmax"}
2021-09-14 16:33:50.920114: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 27Σ) :: 34.8 usec :: :: :: :: :: %custom-call.7 = f16[8,12,128,64]{3,2,1,0} custom-call(f16[8,12,128,128]{3,2,1,0} %fusion.176, f16[8,12,128,64]{3,2,1,0} %fusion.175), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_0/attention/self/MatMul_1"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"2\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920123: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 27Σ) :: 34.8 usec :: :: :: :: :: %custom-call.30 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.144, f16[768,768]{1,0} %constant_396), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_2/attention/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.920128: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 27Σ) :: 34.8 usec :: :: :: :: :: %custom-call.25 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.118, f16[768,768]{1,0} %constant_362, f16[1024,768]{1,0} %broadcast.365), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_2/attention/self/key/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.920134: I tensorflow/compiler/xla/service/executable.cc:221] 49090 cycles ( 0.28% 27Σ) :: 34.8 usec ( 4.1 optimal) :: 90.00GFLOP/s :: 45.18GTROP/s :: 169.61GiB/s :: 129B/cycle :: %fusion.251 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.141, f16[8,12,128]{2,1,0} %get-tuple-element.140), kind=kInput, calls=%fused_computation.251, metadata={op_type="Softmax" op_name="bert/encoder/layer_0/attention/self/Softmax"}
2021-09-14 16:33:50.920139: I tensorflow/compiler/xla/service/executable.cc:221] 47646 cycles ( 0.28% 28Σ) :: 33.8 usec ( 6.1 optimal) :: 92.73GFLOP/s :: :: 260.77GiB/s :: 198B/cycle :: %fusion.240 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.142, f16[8,12,128,128]{3,2,1,0} %custom-call.26), kind=kInput, calls=%fused_computation.240, metadata={op_type="Softmax" op_name="bert/encoder/layer_2/attention/self/Softmax"}
2021-09-14 16:33:50.920145: I tensorflow/compiler/xla/service/executable.cc:221] 47646 cycles ( 0.28% 28Σ) :: 33.8 usec ( 6.1 optimal) :: 92.73GFLOP/s :: :: 260.77GiB/s :: 198B/cycle :: %fusion.228 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.142, f16[8,12,128,128]{3,2,1,0} %custom-call.48), kind=kInput, calls=%fused_computation.228, metadata={op_type="Softmax" op_name="bert/encoder/layer_4/attention/self/Softmax"}
2021-09-14 16:33:50.920151: I tensorflow/compiler/xla/service/executable.cc:221] 47646 cycles ( 0.28% 28Σ) :: 33.8 usec :: :: :: :: :: %custom-call.91 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %get-tuple-element.46, f16[768,768]{1,0} %constant_1400, f16[1024,768]{1,0} %broadcast.1403), custom_call_target="__cublas$gemm", metadata={op_type="BiasAdd" op_name="bert/encoder/layer_8/attention/self/key/BiasAdd"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":1,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"111\"}"
2021-09-14 16:33:50.920156: I tensorflow/compiler/xla/service/executable.cc:221] 47646 cycles ( 0.28% 28Σ) :: 33.8 usec ( 6.1 optimal) :: 92.73GFLOP/s :: :: 260.77GiB/s :: 198B/cycle :: %fusion.216 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.142, f16[8,12,128,128]{3,2,1,0} %custom-call.70), kind=kInput, calls=%fused_computation.216, metadata={op_type="Softmax" op_name="bert/encoder/layer_6/attention/self/Softmax"}
2021-09-14 16:33:50.920165: I tensorflow/compiler/xla/service/executable.cc:221] 47646 cycles ( 0.28% 29Σ) :: 33.8 usec ( 6.1 optimal) :: 92.73GFLOP/s :: :: 260.77GiB/s :: 198B/cycle :: %fusion.204 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.142, f16[8,12,128,128]{3,2,1,0} %custom-call.92), kind=kInput, calls=%fused_computation.204, metadata={op_type="Softmax" op_name="bert/encoder/layer_8/attention/self/Softmax"}
2021-09-14 16:33:50.920170: I tensorflow/compiler/xla/service/executable.cc:221] 47646 cycles ( 0.28% 29Σ) :: 33.8 usec :: :: :: :: :: %custom-call.63 = f16[1024,768]{1,0} custom-call(f16[1024,768]{1,0} %fusion.99, f16[768,768]{1,0} %constant_915), custom_call_target="__cublas$gemm", metadata={op_type="MatMul" op_name="bert/encoder/layer_5/attention/output/dense/MatMul"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"batch_size\":\"1\",\"selected_algorithm\":\"109\"}"
2021-09-14 16:33:50.920176: I tensorflow/compiler/xla/service/executable.cc:221] 46201 cycles ( 0.27% 29Σ) :: 32.8 usec ( 6.1 optimal) :: 95.63GFLOP/s :: :: 268.93GiB/s :: 204B/cycle :: %fusion.234 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.142, f16[8,12,128,128]{3,2,1,0} %custom-call.37), kind=kInput, calls=%fused_computation.234, metadata={op_type="Softmax" op_name="bert/encoder/layer_3/attention/self/Softmax"}
2021-09-14 16:33:50.920181: I tensorflow/compiler/xla/service/executable.cc:221] 44757 cycles ( 0.26% 30Σ) :: 31.7 usec ( 6.1 optimal) :: 98.71GFLOP/s :: :: 277.61GiB/s :: 211B/cycle :: %fusion.198 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.142, f16[8,12,128,128]{3,2,1,0} %custom-call.103), kind=kInput, calls=%fused_computation.198, metadata={op_type="Softmax" op_name="bert/encoder/layer_9/attention/self/Softmax"}
2021-09-14 16:33:50.920187: I tensorflow/compiler/xla/service/executable.cc:221] 44757 cycles ( 0.26% 30Σ) :: 31.7 usec ( 4.1 optimal) :: 98.71GFLOP/s :: 49.55GTROP/s :: 186.03GiB/s :: 141B/cycle :: %fusion.203 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.45, f16[8,12,128]{2,1,0} %get-tuple-element.44), kind=kInput, calls=%fused_computation.203, metadata={op_type="Softmax" op_name="bert/encoder/layer_8/attention/self/Softmax"}
2021-09-14 16:33:50.920192: I tensorflow/compiler/xla/service/executable.cc:221] 44757 cycles ( 0.26% 30Σ) :: 31.7 usec ( 4.1 optimal) :: 98.71GFLOP/s :: 49.55GTROP/s :: 186.03GiB/s :: 141B/cycle :: %fusion.245 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.129, f16[8,12,128]{2,1,0} %get-tuple-element.128), kind=kInput, calls=%fused_computation.245, metadata={op_type="Softmax" op_name="bert/encoder/layer_1/attention/self/Softmax"}
2021-09-14 16:33:50.920198: I tensorflow/compiler/xla/service/executable.cc:221] 43313 cycles ( 0.25% 30Σ) :: 30.7 usec ( 4.1 optimal) :: 102.00GFLOP/s :: 51.20GTROP/s :: 192.23GiB/s :: 146B/cycle :: %fusion.209 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.57, f16[8,12,128]{2,1,0} %get-tuple-element.56), kind=kInput, calls=%fused_computation.209, metadata={op_type="Softmax" op_name="bert/encoder/layer_7/attention/self/Softmax"}
2021-09-14 16:33:50.920206: I tensorflow/compiler/xla/service/executable.cc:221] 43313 cycles ( 0.25% 31Σ) :: 30.7 usec ( 6.1 optimal) :: 102.00GFLOP/s :: :: 286.86GiB/s :: 218B/cycle :: %fusion.210 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.142, f16[8,12,128,128]{3,2,1,0} %custom-call.81), kind=kInput, calls=%fused_computation.210, metadata={op_type="Softmax" op_name="bert/encoder/layer_7/attention/self/Softmax"}
2021-09-14 16:33:50.920211: I tensorflow/compiler/xla/service/executable.cc:221] 43313 cycles ( 0.25% 31Σ) :: 30.7 usec ( 4.1 optimal) :: 102.00GFLOP/s :: 51.20GTROP/s :: 192.23GiB/s :: 146B/cycle :: %fusion.185 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.9, f16[8,12,128]{2,1,0} %get-tuple-element.8), kind=kInput, calls=%fused_computation.185, metadata={op_type="Softmax" op_name="bert/encoder/layer_11/attention/self/Softmax"}
2021-09-14 16:33:50.920217: I tensorflow/compiler/xla/service/executable.cc:221] 41871 cycles ( 0.24% 31Σ) :: 29.7 usec ( 4.1 optimal) :: 105.52GFLOP/s :: 52.97GTROP/s :: 198.85GiB/s :: 151B/cycle :: %fusion.227 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.93, f16[8,12,128]{2,1,0} %get-tuple-element.92), kind=kInput, calls=%fused_computation.227, metadata={op_type="Softmax" op_name="bert/encoder/layer_4/attention/self/Softmax"}
2021-09-14 16:33:50.920222: I tensorflow/compiler/xla/service/executable.cc:221] 41871 cycles ( 0.24% 31Σ) :: 29.7 usec ( 4.1 optimal) :: 105.52GFLOP/s :: 52.97GTROP/s :: 198.85GiB/s :: 151B/cycle :: %fusion.221 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.81, f16[8,12,128]{2,1,0} %get-tuple-element.80), kind=kInput, calls=%fused_computation.221, metadata={op_type="Softmax" op_name="bert/encoder/layer_5/attention/self/Softmax"}
2021-09-14 16:33:50.920227: I tensorflow/compiler/xla/service/executable.cc:221] 41871 cycles ( 0.24% 32Σ) :: 29.7 usec ( 4.1 optimal) :: 105.52GFLOP/s :: 52.97GTROP/s :: 198.85GiB/s :: 151B/cycle :: %fusion.197 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.33, f16[8,12,128]{2,1,0} %get-tuple-element.32), kind=kInput, calls=%fused_computation.197, metadata={op_type="Softmax" op_name="bert/encoder/layer_9/attention/self/Softmax"}
2021-09-14 16:33:50.920232: I tensorflow/compiler/xla/service/executable.cc:221] 40427 cycles ( 0.23% 32Σ) :: 28.7 usec ( 4.1 optimal) :: 109.29GFLOP/s :: 54.86GTROP/s :: 205.96GiB/s :: 156B/cycle :: %fusion.239 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.117, f16[8,12,128]{2,1,0} %get-tuple-element.116), kind=kInput, calls=%fused_computation.239, metadata={op_type="Softmax" op_name="bert/encoder/layer_2/attention/self/Softmax"}
2021-09-14 16:33:50.920237: I tensorflow/compiler/xla/service/executable.cc:221] 40427 cycles ( 0.23% 32Σ) :: 28.7 usec ( 4.1 optimal) :: 109.29GFLOP/s :: 54.86GTROP/s :: 205.96GiB/s :: 156B/cycle :: %fusion.215 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.69, f16[8,12,128]{2,1,0} %get-tuple-element.68), kind=kInput, calls=%fused_computation.215, metadata={op_type="Softmax" op_name="bert/encoder/layer_6/attention/self/Softmax"}
2021-09-14 16:33:50.920246: I tensorflow/compiler/xla/service/executable.cc:221] 40427 cycles ( 0.23% 32Σ) :: 28.7 usec ( 4.1 optimal) :: 109.29GFLOP/s :: 54.86GTROP/s :: 205.96GiB/s :: 156B/cycle :: %fusion.191 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.21, f16[8,12,128]{2,1,0} %get-tuple-element.20), kind=kInput, calls=%fused_computation.191, metadata={op_type="Softmax" op_name="bert/encoder/layer_10/attention/self/Softmax"}
2021-09-14 16:33:50.920251: I tensorflow/compiler/xla/service/executable.cc:221] 38983 cycles ( 0.23% 32Σ) :: 27.6 usec ( 4.1 optimal) :: 113.33GFLOP/s :: 56.89GTROP/s :: 213.59GiB/s :: 162B/cycle :: %fusion.233 = (f16[8,12,128]{2,1,0}, f16[8,12,128,128]{3,2,1,0}) fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.105, f16[8,12,128]{2,1,0} %get-tuple-element.104), kind=kInput, calls=%fused_computation.233, metadata={op_type="Softmax" op_name="bert/encoder/layer_3/attention/self/Softmax"}
2021-09-14 16:33:50.920256: I tensorflow/compiler/xla/service/executable.cc:221] 37538 cycles ( 0.22% 33Σ) :: 26.6 usec ( 8.1 optimal) :: 1.30TFLOP/s :: 118.16GTROP/s :: 440.61GiB/s :: 335B/cycle :: %fusion.124 = f16[1024,3072]{1,0} fusion(f32[3072]{0} %constant_629, f16[1024,3072]{1,0} %custom-call.42), kind=kLoop, calls=%fused_computation.124, metadata={op_type="Cast" op_name="bert/encoder/layer_3/intermediate/dense/mul_3-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920262: I tensorflow/compiler/xla/service/executable.cc:221] 37538 cycles ( 0.22% 33Σ) :: 26.6 usec ( 8.1 optimal) :: 1.30TFLOP/s :: 118.16GTROP/s :: 440.61GiB/s :: 335B/cycle :: %fusion.154 = f16[1024,3072]{1,0} fusion(f32[3072]{0} %constant_283, f16[1024,3072]{1,0} %custom-call.20), kind=kLoop, calls=%fused_computation.154, metadata={op_type="Cast" op_name="bert/encoder/layer_1/intermediate/dense/mul_3-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920268: I tensorflow/compiler/xla/service/executable.cc:221] 37538 cycles ( 0.22% 33Σ) :: 26.6 usec ( 8.1 optimal) :: 1.30TFLOP/s :: 118.16GTROP/s :: 440.61GiB/s :: 335B/cycle :: %fusion.79 = f16[1024,3072]{1,0} fusion(f32[3072]{0} %constant_1148, f16[1024,3072]{1,0} %custom-call.75), kind=kLoop, calls=%fused_computation.79, metadata={op_type="Cast" op_name="bert/encoder/layer_6/intermediate/dense/mul_3-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920273: I tensorflow/compiler/xla/service/executable.cc:221] 37538 cycles ( 0.22% 33Σ) :: 26.6 usec ( 8.1 optimal) :: 1.30TFLOP/s :: 118.16GTROP/s :: 440.61GiB/s :: 335B/cycle :: %fusion.139 = f16[1024,3072]{1,0} fusion(f32[3072]{0} %constant_456, f16[1024,3072]{1,0} %custom-call.31), kind=kLoop, calls=%fused_computation.139, metadata={op_type="Cast" op_name="bert/encoder/layer_2/intermediate/dense/mul_3-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920278: I tensorflow/compiler/xla/service/executable.cc:221] 37538 cycles ( 0.22% 34Σ) :: 26.6 usec ( 8.1 optimal) :: 1.30TFLOP/s :: 118.16GTROP/s :: 440.61GiB/s :: 335B/cycle :: %fusion.64 = f16[1024,3072]{1,0} fusion(f32[3072]{0} %constant_1321, f16[1024,3072]{1,0} %custom-call.86), kind=kLoop, calls=%fused_computation.64, metadata={op_type="Cast" op_name="bert/encoder/layer_7/intermediate/dense/mul_3-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920283: I tensorflow/compiler/xla/service/executable.cc:221] 37538 cycles ( 0.22% 34Σ) :: 26.6 usec ( 8.1 optimal) :: 1.30TFLOP/s :: 118.16GTROP/s :: 440.61GiB/s :: 335B/cycle :: %fusion.49 = f16[1024,3072]{1,0} fusion(f32[3072]{0} %constant_1494, f16[1024,3072]{1,0} %custom-call.97), kind=kLoop, calls=%fused_computation.49, metadata={op_type="Cast" op_name="bert/encoder/layer_8/intermediate/dense/mul_3-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920291: I tensorflow/compiler/xla/service/executable.cc:221] 37538 cycles ( 0.22% 34Σ) :: 26.6 usec ( 8.1 optimal) :: 1.30TFLOP/s :: 118.16GTROP/s :: 440.61GiB/s :: 335B/cycle :: %fusion.19 = f16[1024,3072]{1,0} fusion(f32[3072]{0} %constant_1840, f16[1024,3072]{1,0} %custom-call.119), kind=kLoop, calls=%fused_computation.19, metadata={op_type="Cast" op_name="bert/encoder/layer_10/intermediate/dense/mul_3-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920297: I tensorflow/compiler/xla/service/executable.cc:221] 36094 cycles ( 0.21% 34Σ) :: 25.6 usec :: :: :: :: :: %custom-call.15 = f16[8,12,128,128]{3,2,1,0} custom-call(f16[8,12,128,64]{3,2,1,0} %fusion.164, f16[8,12,128,64]{3,2,1,0} %fusion.163), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_1/attention/self/MatMul"}, backend_config="{\"alpha_real\":0.125,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"3\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920302: I tensorflow/compiler/xla/service/executable.cc:221] 36094 cycles ( 0.21% 34Σ) :: 25.6 usec ( 8.1 optimal) :: 1.35TFLOP/s :: 122.89GTROP/s :: 458.24GiB/s :: 348B/cycle :: %fusion.109 = f16[1024,3072]{1,0} fusion(f32[3072]{0} %constant_802, f16[1024,3072]{1,0} %custom-call.53), kind=kLoop, calls=%fused_computation.109, metadata={op_type="Cast" op_name="bert/encoder/layer_4/intermediate/dense/mul_3-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920307: I tensorflow/compiler/xla/service/executable.cc:221] 36094 cycles ( 0.21% 35Σ) :: 25.6 usec ( 8.1 optimal) :: 1.35TFLOP/s :: 122.89GTROP/s :: 458.24GiB/s :: 348B/cycle :: %fusion.169 = f16[1024,3072]{1,0} fusion(f32[3072]{0} %constant_110, f16[1024,3072]{1,0} %custom-call.9), kind=kLoop, calls=%fused_computation.169, metadata={op_type="Cast" op_name="bert/encoder/layer_0/intermediate/dense/mul_3-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920312: I tensorflow/compiler/xla/service/executable.cc:221] 36094 cycles ( 0.21% 35Σ) :: 25.6 usec :: :: :: :: :: %custom-call.59 = f16[8,12,128,128]{3,2,1,0} custom-call(f16[8,12,128,64]{3,2,1,0} %fusion.104, f16[8,12,128,64]{3,2,1,0} %fusion.103), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_5/attention/self/MatMul"}, backend_config="{\"alpha_real\":0.125,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"3\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920318: I tensorflow/compiler/xla/service/executable.cc:221] 36094 cycles ( 0.21% 35Σ) :: 25.6 usec ( 8.1 optimal) :: 1.35TFLOP/s :: 122.89GTROP/s :: 458.24GiB/s :: 348B/cycle :: %fusion.34 = f16[1024,3072]{1,0} fusion(f32[3072]{0} %constant_1667, f16[1024,3072]{1,0} %custom-call.108), kind=kLoop, calls=%fused_computation.34, metadata={op_type="Cast" op_name="bert/encoder/layer_9/intermediate/dense/mul_3-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920323: I tensorflow/compiler/xla/service/executable.cc:221] 36094 cycles ( 0.21% 35Σ) :: 25.6 usec ( 8.1 optimal) :: 1.35TFLOP/s :: 122.89GTROP/s :: 458.24GiB/s :: 348B/cycle :: %fusion.94 = f16[1024,3072]{1,0} fusion(f32[3072]{0} %constant_975, f16[1024,3072]{1,0} %custom-call.64), kind=kLoop, calls=%fused_computation.94, metadata={op_type="Cast" op_name="bert/encoder/layer_5/intermediate/dense/mul_3-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920341: I tensorflow/compiler/xla/service/executable.cc:221] 36094 cycles ( 0.21% 35Σ) :: 25.6 usec :: :: :: :: :: %custom-call.117 = f16[8,12,128,64]{3,2,1,0} custom-call(f16[8,12,128,128]{3,2,1,0} %fusion.26, f16[8,12,128,64]{3,2,1,0} %fusion.25), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_10/attention/self/MatMul_1"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"2\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920351: I tensorflow/compiler/xla/service/executable.cc:221] 36094 cycles ( 0.21% 36Σ) :: 25.6 usec ( 8.1 optimal) :: 1.35TFLOP/s :: 122.89GTROP/s :: 458.24GiB/s :: 348B/cycle :: %fusion.4 = f16[1024,3072]{1,0} fusion(f32[3072]{0} %constant_2013, f16[1024,3072]{1,0} %custom-call.130), kind=kLoop, calls=%fused_computation.4, metadata={op_type="Cast" op_name="bert/encoder/layer_11/intermediate/dense/mul_3-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920361: I tensorflow/compiler/xla/service/executable.cc:221] 34652 cycles ( 0.20% 36Σ) :: 24.6 usec :: :: :: :: :: %custom-call.40 = f16[8,12,128,64]{3,2,1,0} custom-call(f16[8,12,128,128]{3,2,1,0} %fusion.131, f16[8,12,128,64]{3,2,1,0} %fusion.130), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_3/attention/self/MatMul_1"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"2\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920378: I tensorflow/compiler/xla/service/executable.cc:221] 34652 cycles ( 0.20% 36Σ) :: 24.6 usec ( 3.0 optimal) :: 32.00GFLOP/s :: :: 178.81GiB/s :: 136B/cycle :: %fusion.180 = f16[1024,768]{1,0} fusion(f32[8,128,768]{2,1,0} %arg1.2), kind=kLoop, calls=%fused_computation.180, metadata={op_type="Reshape" op_name="bert/encoder/Reshape"}
2021-09-14 16:33:50.920388: I tensorflow/compiler/xla/service/executable.cc:221] 34652 cycles ( 0.20% 36Σ) :: 24.6 usec :: :: :: :: :: %custom-call.125 = f16[8,12,128,128]{3,2,1,0} custom-call(f16[8,12,128,64]{3,2,1,0} %fusion.14, f16[8,12,128,64]{3,2,1,0} %fusion.13), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_11/attention/self/MatMul"}, backend_config="{\"alpha_real\":0.125,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"3\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920397: I tensorflow/compiler/xla/service/executable.cc:221] 34652 cycles ( 0.20% 36Σ) :: 24.6 usec :: :: :: :: :: %custom-call.128 = f16[8,12,128,64]{3,2,1,0} custom-call(f16[8,12,128,128]{3,2,1,0} %fusion.11, f16[8,12,128,64]{3,2,1,0} %fusion.10), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_11/attention/self/MatMul_1"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"2\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920406: I tensorflow/compiler/xla/service/executable.cc:221] 34652 cycles ( 0.20% 37Σ) :: 24.6 usec :: :: :: :: :: %custom-call.48 = f16[8,12,128,128]{3,2,1,0} custom-call(f16[8,12,128,64]{3,2,1,0} %fusion.119, f16[8,12,128,64]{3,2,1,0} %fusion.118), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_4/attention/self/MatMul"}, backend_config="{\"alpha_real\":0.125,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"3\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920412: I tensorflow/compiler/xla/service/executable.cc:221] 34652 cycles ( 0.20% 37Σ) :: 24.6 usec :: :: :: :: :: %custom-call.114 = f16[8,12,128,128]{3,2,1,0} custom-call(f16[8,12,128,64]{3,2,1,0} %fusion.29, f16[8,12,128,64]{3,2,1,0} %fusion.28), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_10/attention/self/MatMul"}, backend_config="{\"alpha_real\":0.125,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"3\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920418: I tensorflow/compiler/xla/service/executable.cc:221] 34652 cycles ( 0.20% 37Σ) :: 24.6 usec :: :: :: :: :: %custom-call.81 = f16[8,12,128,128]{3,2,1,0} custom-call(f16[8,12,128,64]{3,2,1,0} %fusion.74, f16[8,12,128,64]{3,2,1,0} %fusion.73), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_7/attention/self/MatMul"}, backend_config="{\"alpha_real\":0.125,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"3\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920424: I tensorflow/compiler/xla/service/executable.cc:221] 34652 cycles ( 0.20% 37Σ) :: 24.6 usec :: :: :: :: :: %custom-call.84 = f16[8,12,128,64]{3,2,1,0} custom-call(f16[8,12,128,128]{3,2,1,0} %fusion.71, f16[8,12,128,64]{3,2,1,0} %fusion.70), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_7/attention/self/MatMul_1"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"2\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920429: I tensorflow/compiler/xla/service/executable.cc:221] 34652 cycles ( 0.20% 37Σ) :: 24.6 usec :: :: :: :: :: %custom-call.29 = f16[8,12,128,64]{3,2,1,0} custom-call(f16[8,12,128,128]{3,2,1,0} %fusion.146, f16[8,12,128,64]{3,2,1,0} %fusion.145), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_2/attention/self/MatMul_1"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"2\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920435: I tensorflow/compiler/xla/service/executable.cc:221] 34652 cycles ( 0.20% 38Σ) :: 24.6 usec :: :: :: :: :: %custom-call.26 = f16[8,12,128,128]{3,2,1,0} custom-call(f16[8,12,128,64]{3,2,1,0} %fusion.149, f16[8,12,128,64]{3,2,1,0} %fusion.148), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_2/attention/self/MatMul"}, backend_config="{\"alpha_real\":0.125,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"3\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920444: I tensorflow/compiler/xla/service/executable.cc:221] 34652 cycles ( 0.20% 38Σ) :: 24.6 usec :: :: :: :: :: %custom-call.18 = f16[8,12,128,64]{3,2,1,0} custom-call(f16[8,12,128,128]{3,2,1,0} %fusion.161, f16[8,12,128,64]{3,2,1,0} %fusion.160), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_1/attention/self/MatMul_1"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"2\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920449: I tensorflow/compiler/xla/service/executable.cc:221] 34652 cycles ( 0.20% 38Σ) :: 24.6 usec :: :: :: :: :: %custom-call.103 = f16[8,12,128,128]{3,2,1,0} custom-call(f16[8,12,128,64]{3,2,1,0} %fusion.44, f16[8,12,128,64]{3,2,1,0} %fusion.43), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_9/attention/self/MatMul"}, backend_config="{\"alpha_real\":0.125,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"3\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920455: I tensorflow/compiler/xla/service/executable.cc:221] 34652 cycles ( 0.20% 38Σ) :: 24.6 usec :: :: :: :: :: %custom-call.92 = f16[8,12,128,128]{3,2,1,0} custom-call(f16[8,12,128,64]{3,2,1,0} %fusion.59, f16[8,12,128,64]{3,2,1,0} %fusion.58), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_8/attention/self/MatMul"}, backend_config="{\"alpha_real\":0.125,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"3\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920460: I tensorflow/compiler/xla/service/executable.cc:221] 34652 cycles ( 0.20% 38Σ) :: 24.6 usec :: :: :: :: :: %custom-call.95 = f16[8,12,128,64]{3,2,1,0} custom-call(f16[8,12,128,128]{3,2,1,0} %fusion.56, f16[8,12,128,64]{3,2,1,0} %fusion.55), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_8/attention/self/MatMul_1"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"2\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920466: I tensorflow/compiler/xla/service/executable.cc:221] 33208 cycles ( 0.19% 39Σ) :: 23.6 usec :: :: :: :: :: %custom-call.106 = f16[8,12,128,64]{3,2,1,0} custom-call(f16[8,12,128,128]{3,2,1,0} %fusion.41, f16[8,12,128,64]{3,2,1,0} %fusion.40), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_9/attention/self/MatMul_1"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"2\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920474: I tensorflow/compiler/xla/service/executable.cc:221] 33208 cycles ( 0.19% 39Σ) :: 23.6 usec :: :: :: :: :: %custom-call.62 = f16[8,12,128,64]{3,2,1,0} custom-call(f16[8,12,128,128]{3,2,1,0} %fusion.101, f16[8,12,128,64]{3,2,1,0} %fusion.100), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_5/attention/self/MatMul_1"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"2\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920480: I tensorflow/compiler/xla/service/executable.cc:221] 33208 cycles ( 0.19% 39Σ) :: 23.6 usec :: :: :: :: :: %custom-call.51 = f16[8,12,128,64]{3,2,1,0} custom-call(f16[8,12,128,128]{3,2,1,0} %fusion.116, f16[8,12,128,64]{3,2,1,0} %fusion.115), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_4/attention/self/MatMul_1"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"2\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920485: I tensorflow/compiler/xla/service/executable.cc:221] 33208 cycles ( 0.19% 39Σ) :: 23.6 usec :: :: :: :: :: %custom-call.70 = f16[8,12,128,128]{3,2,1,0} custom-call(f16[8,12,128,64]{3,2,1,0} %fusion.89, f16[8,12,128,64]{3,2,1,0} %fusion.88), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_6/attention/self/MatMul"}, backend_config="{\"alpha_real\":0.125,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"3\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920490: I tensorflow/compiler/xla/service/executable.cc:221] 33208 cycles ( 0.19% 39Σ) :: 23.6 usec :: :: :: :: :: %custom-call.37 = f16[8,12,128,128]{3,2,1,0} custom-call(f16[8,12,128,64]{3,2,1,0} %fusion.134, f16[8,12,128,64]{3,2,1,0} %fusion.133), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_3/attention/self/MatMul"}, backend_config="{\"alpha_real\":0.125,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"3\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920496: I tensorflow/compiler/xla/service/executable.cc:221] 33208 cycles ( 0.19% 40Σ) :: 23.6 usec :: :: :: :: :: %custom-call.73 = f16[8,12,128,64]{3,2,1,0} custom-call(f16[8,12,128,128]{3,2,1,0} %fusion.86, f16[8,12,128,64]{3,2,1,0} %fusion.85), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMul" op_name="bert/encoder/layer_6/attention/self/MatMul_1"}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"3\"],\"rhs_contracting_dimensions\":[\"2\"],\"lhs_batch_dimensions\":[\"0\",\"1\"],\"rhs_batch_dimensions\":[\"0\",\"1\"]},\"batch_size\":\"96\"}"
2021-09-14 16:33:50.920504: I tensorflow/compiler/xla/service/executable.cc:221] 23100 cycles ( 0.13% 40Σ) :: 16.4 usec ( 2.0 optimal) :: 288.20GFLOP/s :: 62.50MTROP/s :: 179.64GiB/s :: 136B/cycle :: %fusion = f32[] fusion(f32[768]{0} %constant_2087, f32[1024,768]{1,0} %get-tuple-element.1, f32[768]{0} %constant_2078, f32[1024]{0} %fusion.1, f32[1024]{0} %get-tuple-element), kind=kInput, calls=%fused_computation, metadata={op_type="Mean" op_name="Mean"}
2021-09-14 16:33:50.920510: I tensorflow/compiler/xla/service/executable.cc:221] 23100 cycles ( 0.13% 40Σ) :: 16.4 usec ( 2.0 optimal) :: :: :: 178.82GiB/s :: 136B/cycle :: %fusion.179 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.1), kind=kLoop, calls=%fused_computation.179, metadata={op_type="Transpose" op_name="bert/encoder/layer_0/attention/self/transpose"}
2021-09-14 16:33:50.920515: I tensorflow/compiler/xla/service/executable.cc:221] 23100 cycles ( 0.13% 40Σ) :: 16.4 usec ( 5.1 optimal) :: 191.95GFLOP/s :: :: 447.47GiB/s :: 340B/cycle :: %fusion.212 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.63, f32[768]{0} %constant_1173, f16[1024,768]{1,0} %custom-call.76), kind=kInput, calls=%fused_computation.212, metadata={op_type="Mean" op_name="bert/encoder/layer_6/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920520: I tensorflow/compiler/xla/service/executable.cc:221] 21656 cycles ( 0.13% 40Σ) :: 15.4 usec ( 5.1 optimal) :: 204.75GFLOP/s :: :: 477.31GiB/s :: 363B/cycle :: %fusion.236 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.111, f32[768]{0} %constant_481, f16[1024,768]{1,0} %custom-call.32), kind=kInput, calls=%fused_computation.236, metadata={op_type="Mean" op_name="bert/encoder/layer_2/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920526: I tensorflow/compiler/xla/service/executable.cc:221] 21656 cycles ( 0.13% 40Σ) :: 15.4 usec ( 5.1 optimal) :: 204.75GFLOP/s :: :: 477.31GiB/s :: 363B/cycle :: %fusion.190 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.23, f32[768]{0} %constant_1783, f16[1024,768]{1,0} %custom-call.118), kind=kInput, calls=%fused_computation.190, metadata={op_type="Mean" op_name="bert/encoder/layer_10/attention/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920531: I tensorflow/compiler/xla/service/executable.cc:221] 21656 cycles ( 0.13% 40Σ) :: 15.4 usec ( 5.1 optimal) :: 204.75GFLOP/s :: :: 477.31GiB/s :: 363B/cycle :: %fusion.206 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.51, f32[768]{0} %constant_1346, f16[1024,768]{1,0} %custom-call.87), kind=kInput, calls=%fused_computation.206, metadata={op_type="Mean" op_name="bert/encoder/layer_7/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920537: I tensorflow/compiler/xla/service/executable.cc:221] 21656 cycles ( 0.13% 41Σ) :: 15.4 usec ( 5.1 optimal) :: 204.75GFLOP/s :: :: 477.31GiB/s :: 363B/cycle :: %fusion.242 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.123, f32[768]{0} %constant_308, f16[1024,768]{1,0} %custom-call.21), kind=kInput, calls=%fused_computation.242, metadata={op_type="Mean" op_name="bert/encoder/layer_1/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920542: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 41Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.196 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.35, f32[768]{0} %constant_1610, f16[1024,768]{1,0} %custom-call.107), kind=kInput, calls=%fused_computation.196, metadata={op_type="Mean" op_name="bert/encoder/layer_9/attention/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920550: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 41Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.208 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.59, f32[768]{0} %constant_1264, f16[1024,768]{1,0} %custom-call.85), kind=kInput, calls=%fused_computation.208, metadata={op_type="Mean" op_name="bert/encoder/layer_7/attention/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920555: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 41Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.200 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.39, f32[768]{0} %constant_1519, f16[1024,768]{1,0} %custom-call.98), kind=kInput, calls=%fused_computation.200, metadata={op_type="Mean" op_name="bert/encoder/layer_8/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920560: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 41Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.194 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.27, f32[768]{0} %constant_1692, f16[1024,768]{1,0} %custom-call.109), kind=kInput, calls=%fused_computation.194, metadata={op_type="Mean" op_name="bert/encoder/layer_9/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920566: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 41Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.202 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.47, f32[768]{0} %constant_1437, f16[1024,768]{1,0} %custom-call.96), kind=kInput, calls=%fused_computation.202, metadata={op_type="Mean" op_name="bert/encoder/layer_8/attention/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920571: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 41Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.214 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.71, f32[768]{0} %constant_1091, f16[1024,768]{1,0} %custom-call.74), kind=kInput, calls=%fused_computation.214, metadata={op_type="Mean" op_name="bert/encoder/layer_6/attention/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920576: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 41Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.244 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.131, f32[768]{0} %constant_226, f16[1024,768]{1,0} %custom-call.19), kind=kInput, calls=%fused_computation.244, metadata={op_type="Mean" op_name="bert/encoder/layer_1/attention/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920581: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 41Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.238 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.119, f32[768]{0} %constant_399, f16[1024,768]{1,0} %custom-call.30), kind=kInput, calls=%fused_computation.238, metadata={op_type="Mean" op_name="bert/encoder/layer_2/attention/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920592: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 42Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.248 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.135, f32[768]{0} %constant_135, f16[1024,768]{1,0} %custom-call.10), kind=kInput, calls=%fused_computation.248, metadata={op_type="Mean" op_name="bert/encoder/layer_0/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920597: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 42Σ) :: 14.3 usec ( 4.1 optimal) :: 274.22GFLOP/s :: :: 409.20GiB/s :: 311B/cycle :: %fusion.250 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[768]{0} %constant_53, f16[1024,768]{1,0} %custom-call.8, f16[1024,768]{1,0} %fusion.180), kind=kInput, calls=%fused_computation.250, metadata={op_type="Mean" op_name="bert/encoder/layer_0/attention/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920603: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 42Σ) :: 14.3 usec ( 1.0 optimal) :: :: :: 102.28GiB/s :: 77B/cycle :: %broadcast.42 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_41), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_0/attention/self/value/BiasAdd"}
2021-09-14 16:33:50.920608: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 42Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.182 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.3, f32[768]{0} %constant_2038, f16[1024,768]{1,0} %custom-call.131), kind=kInput, calls=%fused_computation.182, metadata={op_type="Mean" op_name="bert/encoder/layer_11/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920613: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 42Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.232 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.107, f32[768]{0} %constant_572, f16[1024,768]{1,0} %custom-call.41), kind=kInput, calls=%fused_computation.232, metadata={op_type="Mean" op_name="bert/encoder/layer_3/attention/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920619: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 42Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.230 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.99, f32[768]{0} %constant_654, f16[1024,768]{1,0} %custom-call.43), kind=kInput, calls=%fused_computation.230, metadata={op_type="Mean" op_name="bert/encoder/layer_3/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920624: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 42Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.226 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.95, f32[768]{0} %constant_745, f16[1024,768]{1,0} %custom-call.52), kind=kInput, calls=%fused_computation.226, metadata={op_type="Mean" op_name="bert/encoder/layer_4/attention/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920629: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 42Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.224 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.87, f32[768]{0} %constant_827, f16[1024,768]{1,0} %custom-call.54), kind=kInput, calls=%fused_computation.224, metadata={op_type="Mean" op_name="bert/encoder/layer_4/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920638: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 43Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.188 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.15, f32[768]{0} %constant_1865, f16[1024,768]{1,0} %custom-call.120), kind=kInput, calls=%fused_computation.188, metadata={op_type="Mean" op_name="bert/encoder/layer_10/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920644: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 43Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.220 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.83, f32[768]{0} %constant_918, f16[1024,768]{1,0} %custom-call.63), kind=kInput, calls=%fused_computation.220, metadata={op_type="Mean" op_name="bert/encoder/layer_5/attention/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920649: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 43Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.184 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.11, f32[768]{0} %constant_1956, f16[1024,768]{1,0} %custom-call.129), kind=kInput, calls=%fused_computation.184, metadata={op_type="Mean" op_name="bert/encoder/layer_11/attention/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920655: I tensorflow/compiler/xla/service/executable.cc:221] 20213 cycles ( 0.12% 43Σ) :: 14.3 usec ( 5.1 optimal) :: 219.37GFLOP/s :: :: 511.38GiB/s :: 389B/cycle :: %fusion.218 = (f32[1024]{0}, f32[1024,768]{1,0}) fusion(f32[1024,768]{1,0} %get-tuple-element.75, f32[768]{0} %constant_1000, f16[1024,768]{1,0} %custom-call.65), kind=kInput, calls=%fused_computation.218, metadata={op_type="Mean" op_name="bert/encoder/layer_5/output/LayerNorm/moments/mean"}
2021-09-14 16:33:50.920661: I tensorflow/compiler/xla/service/executable.cc:221] 18768 cycles ( 0.11% 43Σ) :: 13.3 usec ( 5.1 optimal) :: 354.73GFLOP/s :: 76.93MTROP/s :: 551.26GiB/s :: 419B/cycle :: %fusion.229 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.96, f32[768]{0} %constant_703, f32[1024,768]{1,0} %get-tuple-element.97, f32[768]{0} %constant_694, f32[1024]{0} %fusion.121), kind=kLoop, calls=%fused_computation.229, metadata={op_type="Cast" op_name="bert/encoder/layer_3/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920666: I tensorflow/compiler/xla/service/executable.cc:221] 18768 cycles ( 0.11% 43Σ) :: 13.3 usec ( 2.0 optimal) :: 177.25GFLOP/s :: :: 220.67GiB/s :: 168B/cycle :: %fusion.106 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.85, f32[1024]{0} %get-tuple-element.84), kind=kInput, calls=%fused_computation.106, metadata={op_type="Mean" op_name="bert/encoder/layer_4/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920671: I tensorflow/compiler/xla/service/executable.cc:221] 18768 cycles ( 0.11% 43Σ) :: 13.3 usec ( 2.0 optimal) :: 177.25GFLOP/s :: :: 220.67GiB/s :: 168B/cycle :: %fusion.141 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.113, f32[1024]{0} %get-tuple-element.112), kind=kInput, calls=%fused_computation.141, metadata={op_type="Mean" op_name="bert/encoder/layer_2/attention/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920676: I tensorflow/compiler/xla/service/executable.cc:221] 18768 cycles ( 0.11% 43Σ) :: 13.3 usec ( 2.0 optimal) :: 177.25GFLOP/s :: :: 220.67GiB/s :: 168B/cycle :: %fusion.51 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.41, f32[1024]{0} %get-tuple-element.40), kind=kInput, calls=%fused_computation.51, metadata={op_type="Mean" op_name="bert/encoder/layer_8/attention/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920685: I tensorflow/compiler/xla/service/executable.cc:221] 18768 cycles ( 0.11% 43Σ) :: 13.3 usec ( 5.1 optimal) :: 354.73GFLOP/s :: 76.93MTROP/s :: 551.26GiB/s :: 419B/cycle :: %fusion.237 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.112, f32[768]{0} %constant_448, f32[1024,768]{1,0} %get-tuple-element.113, f32[768]{0} %constant_439, f32[1024]{0} %fusion.141), kind=kLoop, calls=%fused_computation.237, metadata={op_type="Cast" op_name="bert/encoder/layer_2/attention/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920691: I tensorflow/compiler/xla/service/executable.cc:221] 18768 cycles ( 0.11% 44Σ) :: 13.3 usec ( 2.0 optimal) :: 177.25GFLOP/s :: :: 220.67GiB/s :: 168B/cycle :: %fusion.21 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.17, f32[1024]{0} %get-tuple-element.16), kind=kInput, calls=%fused_computation.21, metadata={op_type="Mean" op_name="bert/encoder/layer_10/attention/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920696: I tensorflow/compiler/xla/service/executable.cc:221] 18768 cycles ( 0.11% 44Σ) :: 13.3 usec ( 5.1 optimal) :: 354.73GFLOP/s :: 76.93MTROP/s :: 551.26GiB/s :: 419B/cycle :: %fusion.231 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.100, f32[768]{0} %constant_621, f32[1024,768]{1,0} %get-tuple-element.101, f32[768]{0} %constant_612, f32[1024]{0} %fusion.126), kind=kLoop, calls=%fused_computation.231, metadata={op_type="Cast" op_name="bert/encoder/layer_3/attention/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920701: I tensorflow/compiler/xla/service/executable.cc:221] 18768 cycles ( 0.11% 44Σ) :: 13.3 usec ( 2.0 optimal) :: 177.25GFLOP/s :: :: 220.67GiB/s :: 168B/cycle :: %fusion.31 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.25, f32[1024]{0} %get-tuple-element.24), kind=kInput, calls=%fused_computation.31, metadata={op_type="Mean" op_name="bert/encoder/layer_9/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920707: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 44Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.199 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.36, f32[768]{0} %constant_1568, f32[1024,768]{1,0} %get-tuple-element.37, f32[768]{0} %constant_1559, f32[1024]{0} %fusion.46), kind=kLoop, calls=%fused_computation.199, metadata={op_type="Cast" op_name="bert/encoder/layer_8/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920714: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 44Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.81 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.65, f32[1024]{0} %get-tuple-element.64), kind=kInput, calls=%fused_computation.81, metadata={op_type="Mean" op_name="bert/encoder/layer_6/attention/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920719: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 44Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.189 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.16, f32[768]{0} %constant_1832, f32[1024,768]{1,0} %get-tuple-element.17, f32[768]{0} %constant_1823, f32[1024]{0} %fusion.21), kind=kLoop, calls=%fused_computation.189, metadata={op_type="Cast" op_name="bert/encoder/layer_10/attention/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920727: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 44Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.121 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.97, f32[1024]{0} %get-tuple-element.96), kind=kInput, calls=%fused_computation.121, metadata={op_type="Mean" op_name="bert/encoder/layer_3/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920733: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 44Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.126 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.101, f32[1024]{0} %get-tuple-element.100), kind=kInput, calls=%fused_computation.126, metadata={op_type="Mean" op_name="bert/encoder/layer_3/attention/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920738: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 44Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.201 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.40, f32[768]{0} %constant_1486, f32[1024,768]{1,0} %get-tuple-element.41, f32[768]{0} %constant_1477, f32[1024]{0} %fusion.51), kind=kLoop, calls=%fused_computation.201, metadata={op_type="Cast" op_name="bert/encoder/layer_8/attention/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920743: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 44Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.136 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.109, f32[1024]{0} %get-tuple-element.108), kind=kInput, calls=%fused_computation.136, metadata={op_type="Mean" op_name="bert/encoder/layer_2/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920749: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 45Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.151 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.121, f32[1024]{0} %get-tuple-element.120), kind=kInput, calls=%fused_computation.151, metadata={op_type="Mean" op_name="bert/encoder/layer_1/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920754: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 45Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.247 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.132, f32[768]{0} %constant_184, f32[1024,768]{1,0} %get-tuple-element.133, f32[768]{0} %constant_175, f32[1024]{0} %fusion.166), kind=kLoop, calls=%fused_computation.247, metadata={op_type="Cast" op_name="bert/encoder/layer_0/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920760: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 45Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.156 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.125, f32[1024]{0} %get-tuple-element.124), kind=kInput, calls=%fused_computation.156, metadata={op_type="Mean" op_name="bert/encoder/layer_1/attention/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920769: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 45Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.183 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.4, f32[768]{0} %constant_2005, f32[1024,768]{1,0} %get-tuple-element.5, f32[768]{0} %constant_1996, f32[1024]{0} %fusion.6), kind=kLoop, calls=%fused_computation.183, metadata={op_type="Cast" op_name="bert/encoder/layer_11/attention/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920775: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 45Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.187 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.12, f32[768]{0} %constant_1914, f32[1024,768]{1,0} %get-tuple-element.13, f32[768]{0} %constant_1905, f32[1024]{0} %fusion.16), kind=kLoop, calls=%fused_computation.187, metadata={op_type="Cast" op_name="bert/encoder/layer_10/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920780: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 45Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.171 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.137, f32[1024]{0} %get-tuple-element.136), kind=kInput, calls=%fused_computation.171, metadata={op_type="Mean" op_name="bert/encoder/layer_0/attention/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920786: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 45Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.166 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.133, f32[1024]{0} %get-tuple-element.132), kind=kInput, calls=%fused_computation.166, metadata={op_type="Mean" op_name="bert/encoder/layer_0/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920791: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 45Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.249 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.136, f32[768]{0} %constant_102, f32[1024,768]{1,0} %get-tuple-element.137, f32[768]{0} %constant_93, f32[1024]{0} %fusion.171), kind=kLoop, calls=%fused_computation.249, metadata={op_type="Cast" op_name="bert/encoder/layer_0/attention/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920796: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 45Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.243 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.124, f32[768]{0} %constant_275, f32[1024,768]{1,0} %get-tuple-element.125, f32[768]{0} %constant_266, f32[1024]{0} %fusion.156), kind=kLoop, calls=%fused_computation.243, metadata={op_type="Cast" op_name="bert/encoder/layer_1/attention/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920802: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 45Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.217 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.72, f32[768]{0} %constant_1049, f32[1024,768]{1,0} %get-tuple-element.73, f32[768]{0} %constant_1040, f32[1024]{0} %fusion.91), kind=kLoop, calls=%fused_computation.217, metadata={op_type="Cast" op_name="bert/encoder/layer_5/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920810: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 46Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.219 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.76, f32[768]{0} %constant_967, f32[1024,768]{1,0} %get-tuple-element.77, f32[768]{0} %constant_958, f32[1024]{0} %fusion.96), kind=kLoop, calls=%fused_computation.219, metadata={op_type="Cast" op_name="bert/encoder/layer_5/attention/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920815: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 46Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.211 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.60, f32[768]{0} %constant_1222, f32[1024,768]{1,0} %get-tuple-element.61, f32[768]{0} %constant_1213, f32[1024]{0} %fusion.76), kind=kLoop, calls=%fused_computation.211, metadata={op_type="Cast" op_name="bert/encoder/layer_6/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920821: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 46Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.223 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.84, f32[768]{0} %constant_876, f32[1024,768]{1,0} %get-tuple-element.85, f32[768]{0} %constant_867, f32[1024]{0} %fusion.106), kind=kLoop, calls=%fused_computation.223, metadata={op_type="Cast" op_name="bert/encoder/layer_4/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920826: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 46Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.225 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.88, f32[768]{0} %constant_794, f32[1024,768]{1,0} %get-tuple-element.89, f32[768]{0} %constant_785, f32[1024]{0} %fusion.111), kind=kLoop, calls=%fused_computation.225, metadata={op_type="Cast" op_name="bert/encoder/layer_4/attention/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920831: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 46Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.1 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.1, f32[1024]{0} %get-tuple-element), kind=kInput, calls=%fused_computation.1, metadata={op_type="Mean" op_name="bert/encoder/layer_11/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920836: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 46Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.235 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.108, f32[768]{0} %constant_530, f32[1024,768]{1,0} %get-tuple-element.109, f32[768]{0} %constant_521, f32[1024]{0} %fusion.136), kind=kLoop, calls=%fused_computation.235, metadata={op_type="Cast" op_name="bert/encoder/layer_2/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920842: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 46Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.6 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.5, f32[1024]{0} %get-tuple-element.4), kind=kInput, calls=%fused_computation.6, metadata={op_type="Mean" op_name="bert/encoder/layer_11/attention/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920850: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 46Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.16 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.13, f32[1024]{0} %get-tuple-element.12), kind=kInput, calls=%fused_computation.16, metadata={op_type="Mean" op_name="bert/encoder/layer_10/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920856: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 46Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.207 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.52, f32[768]{0} %constant_1313, f32[1024,768]{1,0} %get-tuple-element.53, f32[768]{0} %constant_1304, f32[1024]{0} %fusion.66), kind=kLoop, calls=%fused_computation.207, metadata={op_type="Cast" op_name="bert/encoder/layer_7/attention/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920861: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 46Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.36 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.29, f32[1024]{0} %get-tuple-element.28), kind=kInput, calls=%fused_computation.36, metadata={op_type="Mean" op_name="bert/encoder/layer_9/attention/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920866: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 47Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.46 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.37, f32[1024]{0} %get-tuple-element.36), kind=kInput, calls=%fused_computation.46, metadata={op_type="Mean" op_name="bert/encoder/layer_8/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920871: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 47Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.205 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.48, f32[768]{0} %constant_1395, f32[1024,768]{1,0} %get-tuple-element.49, f32[768]{0} %constant_1386, f32[1024]{0} %fusion.61), kind=kLoop, calls=%fused_computation.205, metadata={op_type="Cast" op_name="bert/encoder/layer_7/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920876: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 47Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.61 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.49, f32[1024]{0} %get-tuple-element.48), kind=kInput, calls=%fused_computation.61, metadata={op_type="Mean" op_name="bert/encoder/layer_7/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920882: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 47Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.66 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.53, f32[1024]{0} %get-tuple-element.52), kind=kInput, calls=%fused_computation.66, metadata={op_type="Mean" op_name="bert/encoder/layer_7/attention/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920887: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 47Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.76 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.61, f32[1024]{0} %get-tuple-element.60), kind=kInput, calls=%fused_computation.76, metadata={op_type="Mean" op_name="bert/encoder/layer_6/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920895: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 47Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.241 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.120, f32[768]{0} %constant_357, f32[1024,768]{1,0} %get-tuple-element.121, f32[768]{0} %constant_348, f32[1024]{0} %fusion.151), kind=kLoop, calls=%fused_computation.241, metadata={op_type="Cast" op_name="bert/encoder/layer_1/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920901: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 47Σ) :: 12.3 usec ( 5.1 optimal) :: 384.25GFLOP/s :: 83.33MTROP/s :: 597.13GiB/s :: 454B/cycle :: %fusion.213 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.64, f32[768]{0} %constant_1140, f32[1024,768]{1,0} %get-tuple-element.65, f32[768]{0} %constant_1131, f32[1024]{0} %fusion.81), kind=kLoop, calls=%fused_computation.213, metadata={op_type="Cast" op_name="bert/encoder/layer_6/attention/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920907: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 47Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.91 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.73, f32[1024]{0} %get-tuple-element.72), kind=kInput, calls=%fused_computation.91, metadata={op_type="Mean" op_name="bert/encoder/layer_5/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920912: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 47Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.96 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.77, f32[1024]{0} %get-tuple-element.76), kind=kInput, calls=%fused_computation.96, metadata={op_type="Mean" op_name="bert/encoder/layer_5/attention/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920918: I tensorflow/compiler/xla/service/executable.cc:221] 17326 cycles ( 0.10% 47Σ) :: 12.3 usec ( 2.0 optimal) :: 192.00GFLOP/s :: :: 239.04GiB/s :: 182B/cycle :: %fusion.111 = f32[1024]{0} fusion(f32[1024,768]{1,0} %get-tuple-element.89, f32[1024]{0} %get-tuple-element.88), kind=kInput, calls=%fused_computation.111, metadata={op_type="Mean" op_name="bert/encoder/layer_4/attention/output/LayerNorm/moments/variance"}
2021-09-14 16:33:50.920923: I tensorflow/compiler/xla/service/executable.cc:221] 15882 cycles ( 0.09% 48Σ) :: 11.3 usec ( 2.0 optimal) :: :: :: 260.10GiB/s :: 198B/cycle :: %fusion.70 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.83), kind=kLoop, calls=%fused_computation.70, metadata={op_type="Transpose" op_name="bert/encoder/layer_7/attention/self/transpose_2"}
2021-09-14 16:33:50.920928: I tensorflow/compiler/xla/service/executable.cc:221] 15882 cycles ( 0.09% 48Σ) :: 11.3 usec ( 2.0 optimal) :: :: :: 260.10GiB/s :: 198B/cycle :: %fusion.178 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.3), kind=kLoop, calls=%fused_computation.178, metadata={op_type="Transpose" op_name="bert/encoder/layer_0/attention/self/transpose_1"}
2021-09-14 16:33:50.920936: I tensorflow/compiler/xla/service/executable.cc:221] 15882 cycles ( 0.09% 48Σ) :: 11.3 usec ( 2.0 optimal) :: :: :: 260.10GiB/s :: 198B/cycle :: %fusion.149 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.23), kind=kLoop, calls=%fused_computation.149, metadata={op_type="Transpose" op_name="bert/encoder/layer_2/attention/self/transpose"}
2021-09-14 16:33:50.920942: I tensorflow/compiler/xla/service/executable.cc:221] 15882 cycles ( 0.09% 48Σ) :: 11.3 usec ( 4.1 optimal) :: 139.64GFLOP/s :: :: 522.23GiB/s :: 397B/cycle :: %fusion.131 = f16[8,12,128,128]{3,2,1,0} fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.103, f16[8,12,128]{2,1,0} %get-tuple-element.102), kind=kLoop, calls=%fused_computation.131, metadata={op_type="Softmax" op_name="bert/encoder/layer_3/attention/self/Softmax"}
2021-09-14 16:33:50.920947: I tensorflow/compiler/xla/service/executable.cc:221] 15882 cycles ( 0.09% 48Σ) :: 11.3 usec ( 1.0 optimal) :: :: :: 130.17GiB/s :: 99B/cycle :: %broadcast.1929 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1928), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_11/attention/self/query/BiasAdd"}
2021-09-14 16:33:50.920952: I tensorflow/compiler/xla/service/executable.cc:221] 15882 cycles ( 0.09% 48Σ) :: 11.3 usec ( 5.1 optimal) :: 419.19GFLOP/s :: 90.91MTROP/s :: 651.43GiB/s :: 496B/cycle :: %fusion.193 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.24, f32[768]{0} %constant_1741, f32[1024,768]{1,0} %get-tuple-element.25, f32[768]{0} %constant_1732, f32[1024]{0} %fusion.31), kind=kLoop, calls=%fused_computation.193, metadata={op_type="Cast" op_name="bert/encoder/layer_9/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920957: I tensorflow/compiler/xla/service/executable.cc:221] 15882 cycles ( 0.09% 48Σ) :: 11.3 usec ( 1.0 optimal) :: :: :: 130.17GiB/s :: 99B/cycle :: %broadcast.1945 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1944), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_11/attention/self/value/BiasAdd"}
2021-09-14 16:33:50.920963: I tensorflow/compiler/xla/service/executable.cc:221] 15882 cycles ( 0.09% 48Σ) :: 11.3 usec ( 5.1 optimal) :: 419.19GFLOP/s :: 90.91MTROP/s :: 651.43GiB/s :: 496B/cycle :: %fusion.195 = (f16[1024,768]{1,0}, f32[1024,768]{1,0}) fusion(f32[1024]{0} %get-tuple-element.28, f32[768]{0} %constant_1659, f32[1024,768]{1,0} %get-tuple-element.29, f32[768]{0} %constant_1650, f32[1024]{0} %fusion.36), kind=kLoop, calls=%fused_computation.195, metadata={op_type="Cast" op_name="bert/encoder/layer_9/attention/output/LayerNorm/batchnorm/add_1-0-CastToFp16-AutoMixedPrecision"}
2021-09-14 16:33:50.920968: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 48Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.100 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.61), kind=kLoop, calls=%fused_computation.100, metadata={op_type="Transpose" op_name="bert/encoder/layer_5/attention/self/transpose_2"}
2021-09-14 16:33:50.920973: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 48Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.884 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_883), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_5/attention/self/key/BiasAdd"}
2021-09-14 16:33:50.920978: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 48Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.891 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_890), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_5/attention/self/query/BiasAdd"}
2021-09-14 16:33:50.920987: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 49Σ) :: 10.2 usec ( 4.1 optimal) :: 153.62GFLOP/s :: :: 574.54GiB/s :: 437B/cycle :: %fusion.71 = f16[8,12,128,128]{3,2,1,0} fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.55, f16[8,12,128]{2,1,0} %get-tuple-element.54), kind=kLoop, calls=%fused_computation.71, metadata={op_type="Softmax" op_name="bert/encoder/layer_7/attention/self/Softmax"}
2021-09-14 16:33:50.920993: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 49Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.907 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_906), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_5/attention/self/value/BiasAdd"}
2021-09-14 16:33:50.920998: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 49Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.73 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.80), kind=kLoop, calls=%fused_computation.73, metadata={op_type="Transpose" op_name="bert/encoder/layer_7/attention/self/transpose_1"}
2021-09-14 16:33:50.921003: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 49Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.372 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_371), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_2/attention/self/query/BiasAdd"}
2021-09-14 16:33:50.921008: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 49Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.85 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.72), kind=kLoop, calls=%fused_computation.85, metadata={op_type="Transpose" op_name="bert/encoder/layer_6/attention/self/transpose_2"}
2021-09-14 16:33:50.921013: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 49Σ) :: 10.2 usec ( 4.1 optimal) :: 153.62GFLOP/s :: :: 574.54GiB/s :: 437B/cycle :: %fusion.86 = f16[8,12,128,128]{3,2,1,0} fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.67, f16[8,12,128]{2,1,0} %get-tuple-element.66), kind=kLoop, calls=%fused_computation.86, metadata={op_type="Softmax" op_name="bert/encoder/layer_6/attention/self/Softmax"}
2021-09-14 16:33:50.921018: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 49Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.1057 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1056), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_6/attention/self/key/BiasAdd"}
2021-09-14 16:33:50.921023: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 49Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.88 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.69), kind=kLoop, calls=%fused_computation.88, metadata={op_type="Transpose" op_name="bert/encoder/layer_6/attention/self/transpose_1"}
2021-09-14 16:33:50.921028: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 49Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.89 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.67), kind=kLoop, calls=%fused_computation.89, metadata={op_type="Transpose" op_name="bert/encoder/layer_6/attention/self/transpose"}
2021-09-14 16:33:50.921037: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 49Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.1080 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1079), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_6/attention/self/value/BiasAdd"}
2021-09-14 16:33:50.921042: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 49Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.59 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.89), kind=kLoop, calls=%fused_computation.59, metadata={op_type="Transpose" op_name="bert/encoder/layer_8/attention/self/transpose"}
2021-09-14 16:33:50.921047: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 49Σ) :: 10.2 usec ( 4.1 optimal) :: 153.62GFLOP/s :: :: 574.54GiB/s :: 437B/cycle :: %fusion.101 = f16[8,12,128,128]{3,2,1,0} fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.79, f16[8,12,128]{2,1,0} %get-tuple-element.78), kind=kLoop, calls=%fused_computation.101, metadata={op_type="Softmax" op_name="bert/encoder/layer_5/attention/self/Softmax"}
2021-09-14 16:33:50.921052: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 50Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.103 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.58), kind=kLoop, calls=%fused_computation.103, metadata={op_type="Transpose" op_name="bert/encoder/layer_5/attention/self/transpose_1"}
2021-09-14 16:33:50.921057: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 50Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.104 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.56), kind=kLoop, calls=%fused_computation.104, metadata={op_type="Transpose" op_name="bert/encoder/layer_5/attention/self/transpose"}
2021-09-14 16:33:50.921063: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 50Σ) :: 10.2 usec ( 4.1 optimal) :: 153.62GFLOP/s :: :: 574.54GiB/s :: 437B/cycle :: %fusion.116 = f16[8,12,128,128]{3,2,1,0} fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.91, f16[8,12,128]{2,1,0} %get-tuple-element.90), kind=kLoop, calls=%fused_computation.116, metadata={op_type="Softmax" op_name="bert/encoder/layer_4/attention/self/Softmax"}
2021-09-14 16:33:50.921071: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 50Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.118 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.47), kind=kLoop, calls=%fused_computation.118, metadata={op_type="Transpose" op_name="bert/encoder/layer_4/attention/self/transpose_1"}
2021-09-14 16:33:50.921077: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 50Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.119 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.45), kind=kLoop, calls=%fused_computation.119, metadata={op_type="Transpose" op_name="bert/encoder/layer_4/attention/self/transpose"}
2021-09-14 16:33:50.921085: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 50Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.1410 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1409), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_8/attention/self/query/BiasAdd"}
2021-09-14 16:33:50.921090: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 50Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.130 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.39), kind=kLoop, calls=%fused_computation.130, metadata={op_type="Transpose" op_name="bert/encoder/layer_3/attention/self/transpose_2"}
2021-09-14 16:33:50.921096: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 50Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.215 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_214), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_1/attention/self/value/BiasAdd"}
2021-09-14 16:33:50.921101: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 50Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.1426 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1425), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_8/attention/self/value/BiasAdd"}
2021-09-14 16:33:50.921106: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 50Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.561 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_560), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_3/attention/self/value/BiasAdd"}
2021-09-14 16:33:50.921111: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 50Σ) :: 10.2 usec ( 4.1 optimal) :: 153.62GFLOP/s :: :: 574.54GiB/s :: 437B/cycle :: %fusion.26 = f16[8,12,128,128]{3,2,1,0} fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.19, f16[8,12,128]{2,1,0} %get-tuple-element.18), kind=kLoop, calls=%fused_computation.26, metadata={op_type="Softmax" op_name="bert/encoder/layer_10/attention/self/Softmax"}
2021-09-14 16:33:50.921116: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 50Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.25 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.116), kind=kLoop, calls=%fused_computation.25, metadata={op_type="Transpose" op_name="bert/encoder/layer_10/attention/self/transpose_2"}
2021-09-14 16:33:50.921121: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 51Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.24 = f16[1024,768]{1,0} fusion(f16[8,12,128,64]{3,2,1,0} %custom-call.117), kind=kLoop, calls=%fused_computation.24, metadata={op_type="Reshape" op_name="bert/encoder/layer_10/attention/self/Reshape_3"}
2021-09-14 16:33:50.921127: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 51Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.538 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_537), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_3/attention/self/key/BiasAdd"}
2021-09-14 16:33:50.921135: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 51Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.28 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.113), kind=kLoop, calls=%fused_computation.28, metadata={op_type="Transpose" op_name="bert/encoder/layer_10/attention/self/transpose_1"}
2021-09-14 16:33:50.921140: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 51Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.545 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_544), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_3/attention/self/query/BiasAdd"}
2021-09-14 16:33:50.921145: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 51Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.14 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.122), kind=kLoop, calls=%fused_computation.14, metadata={op_type="Transpose" op_name="bert/encoder/layer_11/attention/self/transpose"}
2021-09-14 16:33:50.921150: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 51Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.13 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.124), kind=kLoop, calls=%fused_computation.13, metadata={op_type="Transpose" op_name="bert/encoder/layer_11/attention/self/transpose_1"}
2021-09-14 16:33:50.921155: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 51Σ) :: 10.2 usec ( 4.1 optimal) :: 153.62GFLOP/s :: :: 574.54GiB/s :: 437B/cycle :: %fusion.11 = f16[8,12,128,128]{3,2,1,0} fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.7, f16[8,12,128]{2,1,0} %get-tuple-element.6), kind=kLoop, calls=%fused_computation.11, metadata={op_type="Softmax" op_name="bert/encoder/layer_11/attention/self/Softmax"}
2021-09-14 16:33:50.921160: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 51Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.10 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.127), kind=kLoop, calls=%fused_computation.10, metadata={op_type="Transpose" op_name="bert/encoder/layer_11/attention/self/transpose_2"}
2021-09-14 16:33:50.921165: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 51Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.29 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.111), kind=kLoop, calls=%fused_computation.29, metadata={op_type="Transpose" op_name="bert/encoder/layer_10/attention/self/transpose"}
2021-09-14 16:33:50.921171: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 51Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.388 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_387), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_2/attention/self/value/BiasAdd"}
2021-09-14 16:33:50.921176: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 51Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.39 = f16[1024,768]{1,0} fusion(f16[8,12,128,64]{3,2,1,0} %custom-call.106), kind=kLoop, calls=%fused_computation.39, metadata={op_type="Reshape" op_name="bert/encoder/layer_9/attention/self/Reshape_3"}
2021-09-14 16:33:50.921184: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 51Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.40 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.105), kind=kLoop, calls=%fused_computation.40, metadata={op_type="Transpose" op_name="bert/encoder/layer_9/attention/self/transpose_2"}
2021-09-14 16:33:50.921189: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 52Σ) :: 10.2 usec ( 4.1 optimal) :: 153.62GFLOP/s :: :: 574.54GiB/s :: 437B/cycle :: %fusion.41 = f16[8,12,128,128]{3,2,1,0} fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.31, f16[8,12,128]{2,1,0} %get-tuple-element.30), kind=kLoop, calls=%fused_computation.41, metadata={op_type="Softmax" op_name="bert/encoder/layer_9/attention/self/Softmax"}
2021-09-14 16:33:50.921194: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 52Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.43 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.102), kind=kLoop, calls=%fused_computation.43, metadata={op_type="Transpose" op_name="bert/encoder/layer_9/attention/self/transpose_1"}
2021-09-14 16:33:50.921199: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 52Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.44 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.100), kind=kLoop, calls=%fused_computation.44, metadata={op_type="Transpose" op_name="bert/encoder/layer_9/attention/self/transpose"}
2021-09-14 16:33:50.921204: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 52Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.718 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_717), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_4/attention/self/query/BiasAdd"}
2021-09-14 16:33:50.921209: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 52Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.734 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_733), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_4/attention/self/value/BiasAdd"}
2021-09-14 16:33:50.921216: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 52Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.55 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.94), kind=kLoop, calls=%fused_computation.55, metadata={op_type="Transpose" op_name="bert/encoder/layer_8/attention/self/transpose_2"}
2021-09-14 16:33:50.921221: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 52Σ) :: 10.2 usec ( 4.1 optimal) :: 153.62GFLOP/s :: :: 574.54GiB/s :: 437B/cycle :: %fusion.56 = f16[8,12,128,128]{3,2,1,0} fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.43, f16[8,12,128]{2,1,0} %get-tuple-element.42), kind=kLoop, calls=%fused_computation.56, metadata={op_type="Softmax" op_name="bert/encoder/layer_8/attention/self/Softmax"}
2021-09-14 16:33:50.921226: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 52Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.58 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.91), kind=kLoop, calls=%fused_computation.58, metadata={op_type="Transpose" op_name="bert/encoder/layer_8/attention/self/transpose_1"}
2021-09-14 16:33:50.921234: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 52Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.1922 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1921), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_11/attention/self/key/BiasAdd"}
2021-09-14 16:33:50.921239: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 52Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.163 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.14), kind=kLoop, calls=%fused_computation.163, metadata={op_type="Transpose" op_name="bert/encoder/layer_1/attention/self/transpose_1"}
2021-09-14 16:33:50.921244: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 52Σ) :: 10.2 usec ( 4.1 optimal) :: 153.62GFLOP/s :: :: 574.54GiB/s :: 437B/cycle :: %fusion.146 = f16[8,12,128,128]{3,2,1,0} fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.115, f16[8,12,128]{2,1,0} %get-tuple-element.114), kind=kLoop, calls=%fused_computation.146, metadata={op_type="Softmax" op_name="bert/encoder/layer_2/attention/self/Softmax"}
2021-09-14 16:33:50.921250: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 52Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.148 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.25), kind=kLoop, calls=%fused_computation.148, metadata={op_type="Transpose" op_name="bert/encoder/layer_2/attention/self/transpose_1"}
2021-09-14 16:33:50.921255: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 53Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.1583 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1582), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_9/attention/self/query/BiasAdd"}
2021-09-14 16:33:50.921260: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 53Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.1599 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1598), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_9/attention/self/value/BiasAdd"}
2021-09-14 16:33:50.921265: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 53Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.192 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_191), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_1/attention/self/key/BiasAdd"}
2021-09-14 16:33:50.921270: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 53Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.159 = f16[1024,768]{1,0} fusion(f16[8,12,128,64]{3,2,1,0} %custom-call.18), kind=kLoop, calls=%fused_computation.159, metadata={op_type="Reshape" op_name="bert/encoder/layer_1/attention/self/Reshape_3"}
2021-09-14 16:33:50.921275: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 53Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.160 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.17), kind=kLoop, calls=%fused_computation.160, metadata={op_type="Transpose" op_name="bert/encoder/layer_1/attention/self/transpose_2"}
2021-09-14 16:33:50.921284: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 53Σ) :: 10.2 usec ( 4.1 optimal) :: 153.62GFLOP/s :: :: 574.54GiB/s :: 437B/cycle :: %fusion.161 = f16[8,12,128,128]{3,2,1,0} fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.127, f16[8,12,128]{2,1,0} %get-tuple-element.126), kind=kLoop, calls=%fused_computation.161, metadata={op_type="Softmax" op_name="bert/encoder/layer_1/attention/self/Softmax"}
2021-09-14 16:33:50.921289: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 53Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.134 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.34), kind=kLoop, calls=%fused_computation.134, metadata={op_type="Transpose" op_name="bert/encoder/layer_3/attention/self/transpose"}
2021-09-14 16:33:50.921295: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 53Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.164 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.12), kind=kLoop, calls=%fused_computation.164, metadata={op_type="Transpose" op_name="bert/encoder/layer_1/attention/self/transpose"}
2021-09-14 16:33:50.921300: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 53Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.1749 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1748), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_10/attention/self/key/BiasAdd"}
2021-09-14 16:33:50.921306: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 53Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.1756 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1755), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_10/attention/self/query/BiasAdd"}
2021-09-14 16:33:50.921312: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 53Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.1772 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1771), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_10/attention/self/value/BiasAdd"}
2021-09-14 16:33:50.921321: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 53Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.174 = f16[1024,768]{1,0} fusion(f16[8,12,128,64]{3,2,1,0} %custom-call.7), kind=kLoop, calls=%fused_computation.174, metadata={op_type="Reshape" op_name="bert/encoder/layer_0/attention/self/Reshape_3"}
2021-09-14 16:33:50.921330: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 54Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.175 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.6), kind=kLoop, calls=%fused_computation.175, metadata={op_type="Transpose" op_name="bert/encoder/layer_0/attention/self/transpose_2"}
2021-09-14 16:33:50.921338: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 54Σ) :: 10.2 usec ( 4.1 optimal) :: 153.62GFLOP/s :: :: 574.54GiB/s :: 437B/cycle :: %fusion.176 = f16[8,12,128,128]{3,2,1,0} fusion(f16[8,12,128,128]{3,2,1,0} %get-tuple-element.139, f16[8,12,128]{2,1,0} %get-tuple-element.138), kind=kLoop, calls=%fused_computation.176, metadata={op_type="Softmax" op_name="bert/encoder/layer_0/attention/self/Softmax"}
2021-09-14 16:33:50.921350: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 54Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.144 = f16[1024,768]{1,0} fusion(f16[8,12,128,64]{3,2,1,0} %custom-call.29), kind=kLoop, calls=%fused_computation.144, metadata={op_type="Reshape" op_name="bert/encoder/layer_2/attention/self/Reshape_3"}
2021-09-14 16:33:50.921360: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 54Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.133 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.36), kind=kLoop, calls=%fused_computation.133, metadata={op_type="Transpose" op_name="bert/encoder/layer_3/attention/self/transpose_1"}
2021-09-14 16:33:50.921369: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 54Σ) :: 10.2 usec ( 2.0 optimal) :: :: :: 286.15GiB/s :: 217B/cycle :: %fusion.145 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.28), kind=kLoop, calls=%fused_computation.145, metadata={op_type="Transpose" op_name="bert/encoder/layer_2/attention/self/transpose_2"}
2021-09-14 16:33:50.921378: I tensorflow/compiler/xla/service/executable.cc:221] 14436 cycles ( 0.08% 54Σ) :: 10.2 usec ( 1.0 optimal) :: :: :: 143.21GiB/s :: 109B/cycle :: %broadcast.199 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_198), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_1/attention/self/query/BiasAdd"}
2021-09-14 16:33:50.921386: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 54Σ) :: 9.2 usec ( 1.0 optimal) :: :: :: 159.11GiB/s :: 121B/cycle :: %broadcast.711 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_710), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_4/attention/self/key/BiasAdd"}
2021-09-14 16:33:50.921392: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 54Σ) :: 9.2 usec ( 2.0 optimal) :: :: :: 317.90GiB/s :: 242B/cycle :: %fusion.69 = f16[1024,768]{1,0} fusion(f16[8,12,128,64]{3,2,1,0} %custom-call.84), kind=kLoop, calls=%fused_computation.69, metadata={op_type="Reshape" op_name="bert/encoder/layer_7/attention/self/Reshape_3"}
2021-09-14 16:33:50.921397: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 54Σ) :: 9.2 usec ( 1.0 optimal) :: :: :: 159.11GiB/s :: 121B/cycle :: %broadcast.26 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_25), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_0/attention/self/query/BiasAdd"}
2021-09-14 16:33:50.921402: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 54Σ) :: 9.2 usec ( 2.0 optimal) :: :: :: 317.90GiB/s :: 242B/cycle :: %fusion.9 = f16[1024,768]{1,0} fusion(f16[8,12,128,64]{3,2,1,0} %custom-call.128), kind=kLoop, calls=%fused_computation.9, metadata={op_type="Reshape" op_name="bert/encoder/layer_11/attention/self/Reshape_3"}
2021-09-14 16:33:50.921407: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 54Σ) :: 9.2 usec ( 1.0 optimal) :: :: :: 159.11GiB/s :: 121B/cycle :: %broadcast.1403 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1402), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_8/attention/self/key/BiasAdd"}
2021-09-14 16:33:50.921415: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 54Σ) :: 9.2 usec ( 2.0 optimal) :: :: :: 317.90GiB/s :: 242B/cycle :: %fusion.74 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.78), kind=kLoop, calls=%fused_computation.74, metadata={op_type="Transpose" op_name="bert/encoder/layer_7/attention/self/transpose"}
2021-09-14 16:33:50.921420: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 54Σ) :: 9.2 usec ( 2.0 optimal) :: :: :: 317.90GiB/s :: 242B/cycle :: %fusion.129 = f16[1024,768]{1,0} fusion(f16[8,12,128,64]{3,2,1,0} %custom-call.40), kind=kLoop, calls=%fused_computation.129, metadata={op_type="Reshape" op_name="bert/encoder/layer_3/attention/self/Reshape_3"}
2021-09-14 16:33:50.921425: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 55Σ) :: 9.2 usec ( 2.0 optimal) :: :: :: 317.90GiB/s :: 242B/cycle :: %fusion.84 = f16[1024,768]{1,0} fusion(f16[8,12,128,64]{3,2,1,0} %custom-call.73), kind=kLoop, calls=%fused_computation.84, metadata={op_type="Reshape" op_name="bert/encoder/layer_6/attention/self/Reshape_3"}
2021-09-14 16:33:50.921430: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 55Σ) :: 9.2 usec ( 1.0 optimal) :: :: :: 159.11GiB/s :: 121B/cycle :: %broadcast.19 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_18), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_0/attention/self/key/BiasAdd"}
2021-09-14 16:33:50.921436: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 55Σ) :: 9.2 usec ( 2.0 optimal) :: :: :: 317.90GiB/s :: 242B/cycle :: %fusion.115 = f16[8,12,128,64]{3,2,1,0} fusion(f16[1024,768]{1,0} %custom-call.50), kind=kLoop, calls=%fused_computation.115, metadata={op_type="Transpose" op_name="bert/encoder/layer_4/attention/self/transpose_2"}
2021-09-14 16:33:50.921441: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 55Σ) :: 9.2 usec ( 1.0 optimal) :: :: :: 159.11GiB/s :: 121B/cycle :: %broadcast.1064 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1063), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_6/attention/self/query/BiasAdd"}
2021-09-14 16:33:50.921446: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 55Σ) :: 9.2 usec ( 2.0 optimal) :: :: :: 317.90GiB/s :: 242B/cycle :: %fusion.114 = f16[1024,768]{1,0} fusion(f16[8,12,128,64]{3,2,1,0} %custom-call.51), kind=kLoop, calls=%fused_computation.114, metadata={op_type="Reshape" op_name="bert/encoder/layer_4/attention/self/Reshape_3"}
2021-09-14 16:33:50.921451: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 55Σ) :: 9.2 usec ( 2.0 optimal) :: :: :: 317.90GiB/s :: 242B/cycle :: %fusion.54 = f16[1024,768]{1,0} fusion(f16[8,12,128,64]{3,2,1,0} %custom-call.95), kind=kLoop, calls=%fused_computation.54, metadata={op_type="Reshape" op_name="bert/encoder/layer_8/attention/self/Reshape_3"}
2021-09-14 16:33:50.921456: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 55Σ) :: 9.2 usec ( 1.0 optimal) :: :: :: 159.11GiB/s :: 121B/cycle :: %broadcast.365 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_364), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_2/attention/self/key/BiasAdd"}
2021-09-14 16:33:50.921464: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 55Σ) :: 9.2 usec ( 2.0 optimal) :: :: :: 317.90GiB/s :: 242B/cycle :: %fusion.99 = f16[1024,768]{1,0} fusion(f16[8,12,128,64]{3,2,1,0} %custom-call.62), kind=kLoop, calls=%fused_computation.99, metadata={op_type="Reshape" op_name="bert/encoder/layer_5/attention/self/Reshape_3"}
2021-09-14 16:33:50.921469: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 55Σ) :: 9.2 usec ( 1.0 optimal) :: :: :: 159.11GiB/s :: 121B/cycle :: %broadcast.1253 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1252), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_7/attention/self/value/BiasAdd"}
2021-09-14 16:33:50.921475: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 55Σ) :: 9.2 usec ( 1.0 optimal) :: :: :: 159.11GiB/s :: 121B/cycle :: %broadcast.1576 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1575), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_9/attention/self/key/BiasAdd"}
2021-09-14 16:33:50.921480: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 55Σ) :: 9.2 usec ( 1.0 optimal) :: :: :: 159.11GiB/s :: 121B/cycle :: %broadcast.1237 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1236), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_7/attention/self/query/BiasAdd"}
2021-09-14 16:33:50.921485: I tensorflow/compiler/xla/service/executable.cc:221] 12994 cycles ( 0.08% 55Σ) :: 9.2 usec ( 1.0 optimal) :: :: :: 159.11GiB/s :: 121B/cycle :: %broadcast.1230 = f16[1024,768]{1,0} broadcast(f16[768]{0} %constant_1229), dimensions={1}, metadata={op_type="BiasAdd" op_name="bert/encoder/layer_7/attention/self/key/BiasAdd"}
2021-09-14 16:33:50.921490: I tensorflow/compiler/xla/service/executable.cc:221] 10106 cycles ( 0.06% 55Σ) :: 7.2 usec ( 0.0 optimal) :: 139.52kFLOP/s :: :: 1.60MiB/s :: 0.001B/cycle :: %multiply.76 = f32[] multiply(f32[] %fusion, f32[] %constant_153), metadata={op_type="Mean" op_name="Mean"}
2021-09-14 16:33:50.921495: I tensorflow/compiler/xla/service/executable.cc:221]