1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h"
17 
18 #include <math.h>
19 
20 #include <algorithm>
21 #include <string>
22 #include <vector>
23 
24 #include "google/protobuf/any.pb.h"
25 #include "absl/container/flat_hash_map.h"
26 #include "absl/strings/match.h"
27 #include "absl/strings/str_cat.h"
28 #include "absl/strings/string_view.h"
29 #include "tensorflow/core/lib/gtl/map_util.h"
30 #include "tensorflow/core/platform/logging.h"
31 #include "tensorflow/core/platform/types.h"
32 #include "tensorflow/core/profiler/convert/op_metrics_to_record.h"
33 #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
34 #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
35 #include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h"
36 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
37 #include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
38 #include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
39 #include "tensorflow/core/profiler/utils/diagnostics.h"
40 #include "tensorflow/core/profiler/utils/event_span.h"
41 #include "tensorflow/core/profiler/utils/format_utils.h"
42 #include "tensorflow/core/profiler/utils/hardware_type_utils.h"
43 #include "tensorflow/core/profiler/utils/html_utils.h"
44 #include "tensorflow/core/profiler/utils/math_utils.h"
45 #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h"
46 #include "tensorflow/core/profiler/utils/tf_op_utils.h"
47 #include "tensorflow/core/util/stats_calculator.h"
48 
49 namespace tensorflow {
50 namespace profiler {
51 
52 namespace {
53 
// Number of picoseconds in one millisecond (1 ms = 10^9 ps).
const double kNumPsPerMs = 1000000000.0;

// If the percentage of step time that is due to infeed is less than
// kModeratelyInfeedBoundThresholdInPercent, it is considered NOT
// input-bound; else if it is less than
// kHighlyInfeedBoundThresholdInPercent, it is considered MODERATELY
// input-bound; else it is considered HIGHLY input-bound.
constexpr double kModeratelyInfeedBoundThresholdInPercent = 5;
constexpr double kHighlyInfeedBoundThresholdInPercent = 20;

// If the percentage of step time that is due to outfeed is less than
// kModeratelyOutfeedBoundThresholdInPercent, it is considered NOT
// output-bound; else if it is less than
// kHighlyOutfeedBoundThresholdInPercent, it is considered MODERATELY
// output-bound; else it is considered HIGHLY output-bound.
constexpr double kModeratelyOutfeedBoundThresholdInPercent = 5;
constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;

// If the percentage of step time that is due to kernel launch is less than
// kModeratelyKernelLaunchBoundThresholdInPercent, it is considered NOT
// kernel-launch bound; else if it is less than
// kHighlyKernelLaunchBoundThresholdInPercent, it is considered MODERATELY
// kernel-launch bound; else it is considered HIGHLY kernel-launch bound.
constexpr double kModeratelyKernelLaunchBoundThresholdInPercent = 3;
constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;

// If the percentage of step time that is due to all other time is less than
// kModeratelyAllOtherBoundThresholdInPercent, it is considered NOT
// all-other bound; else if it is less than
// kHighlyAllOtherBoundThresholdInPercent, it is considered MODERATELY
// all-other bound; else it is considered HIGHLY all-other bound.
constexpr double kModeratelyAllOtherBoundThresholdInPercent = 3;
constexpr double kHighlyAllOtherBoundThresholdInPercent = 15;

// If the percentage of step time that is due to device collectives is less
// than kModeratelyDeviceCollectivesBoundThresholdInPercent, it is considered
// NOT device-collectives bound; else if it is less than
// kHighlyDeviceCollectivesBoundThresholdInPercent, it is considered MODERATELY
// device-collectives bound; else it is considered HIGHLY device-collectives
// bound.
constexpr double kModeratelyDeviceCollectivesBoundThresholdInPercent = 3;
constexpr double kHighlyDeviceCollectivesBoundThresholdInPercent = 15;

// Section number of the host-analysis section in the input-pipeline analysis.
constexpr int kHostAnalysisSectionNumber = 3;
// Python-only explanation for "All Others" time.
const char* kAllOthersPythonExplanation =
    " % of the total step time sampled is spent on 'All Others' time. "
    "This could be due to Python execution overhead.";
// Explanation for "Kernel Launch" time due to CPU contention with tf.data.
const char* kKernelLaunchTfDataContention =
    " It could be due to CPU contention with tf.data. In this case, you may "
    "try to set the environment variable TF_GPU_THREAD_MODE=gpu_private.";
107 
108 template <class Collection>
GetTimeInMs(const Collection & type_ps,EventType event_type)109 double GetTimeInMs(const Collection& type_ps, EventType event_type) {
110   return PicoToMilli(gtl::FindWithDefault(type_ps, event_type, /*value=*/0));
111 }
112 
GetStepSummaryForSampleStats(const Stat<double> & sample_stats)113 StepSummary GetStepSummaryForSampleStats(const Stat<double>& sample_stats) {
114   StepSummary step_time_summary;
115   double avg, sdv, min, max;
116   if (sample_stats.empty()) {
117     // If sample_stats is empty, sample_stats.avg() will return NaN. However, we
118     // prefer to show an 0 instead.
119     avg = sdv = min = max = 0.0;
120   } else {
121     avg = sample_stats.avg();
122     sdv = sqrt(sample_stats.sample_variance());
123     min = sample_stats.min();
124     max = sample_stats.max();
125   }
126   step_time_summary.set_average(avg);
127   step_time_summary.set_standard_deviation(sdv);
128   step_time_summary.set_minimum(min);
129   step_time_summary.set_maximum(max);
130   return step_time_summary;
131 }
132 
// Computes the step-time breakdown (all components in ms) by unpacking the
// PerGenericStepDetails recorded for every step in `analysis` and
// summarizing each time component across steps. Returns an empty breakdown
// if any step's details cannot be unpacked.
GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
    const InputPipelineAnalysisResult& analysis) {
  // One accumulator per time component; each collects one sample per step.
  Stat<double> unknown_time_ms;
  Stat<double> host_wait_input_ms;
  Stat<double> host_to_device_ms;
  Stat<double> input_ms;
  Stat<double> output_ms;
  Stat<double> device_compute_ms;
  Stat<double> device_to_device_ms;
  Stat<double> device_collectives_ms;
  Stat<double> host_compute_ms;
  Stat<double> host_prepare_ms;
  Stat<double> host_compile_ms;
  GenericStepTimeBreakdown result;

  for (const google::protobuf::Any& step_details : analysis.step_details()) {
    PerGenericStepDetails details;
    bool success = step_details.UnpackTo(&details);
    // An empty type_url means an unset Any, which is tolerated; any other
    // unpack failure indicates the payload is not generic step details.
    if (!success && !step_details.type_url().empty()) {
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    unknown_time_ms.UpdateStat(details.unknown_time_ms());
    host_wait_input_ms.UpdateStat(details.host_wait_input_ms());
    host_to_device_ms.UpdateStat(details.host_to_device_ms());
    // Total input time is host waiting for input plus host-to-device
    // transfer.
    input_ms.UpdateStat(details.host_wait_input_ms() +
                        details.host_to_device_ms());
    output_ms.UpdateStat(details.output_ms());
    device_compute_ms.UpdateStat(details.device_compute_ms());
    device_to_device_ms.UpdateStat(details.device_to_device_ms());
    device_collectives_ms.UpdateStat(details.device_collectives_ms());
    host_compute_ms.UpdateStat(details.host_compute_ms());
    host_prepare_ms.UpdateStat(details.host_prepare_ms());
    host_compile_ms.UpdateStat(details.host_compile_ms());
  }
  *result.mutable_unknown_time_ms_summary() =
      GetStepSummaryForSampleStats(unknown_time_ms);
  *result.mutable_host_wait_input_ms_summary() =
      GetStepSummaryForSampleStats(host_wait_input_ms);
  *result.mutable_host_to_device_ms_summary() =
      GetStepSummaryForSampleStats(host_to_device_ms);
  *result.mutable_input_ms_summary() = GetStepSummaryForSampleStats(input_ms);
  *result.mutable_output_ms_summary() = GetStepSummaryForSampleStats(output_ms);
  *result.mutable_device_compute_ms_summary() =
      GetStepSummaryForSampleStats(device_compute_ms);
  *result.mutable_device_to_device_ms_summary() =
      GetStepSummaryForSampleStats(device_to_device_ms);
  *result.mutable_device_collectives_ms_summary() =
      GetStepSummaryForSampleStats(device_collectives_ms);
  *result.mutable_host_compute_ms_summary() =
      GetStepSummaryForSampleStats(host_compute_ms);
  *result.mutable_host_prepare_ms_summary() =
      GetStepSummaryForSampleStats(host_prepare_ms);
  *result.mutable_host_compile_ms_summary() =
      GetStepSummaryForSampleStats(host_compile_ms);
  return result;
}
191 
// Builds the generic (non-TPU-specific) input-pipeline analysis from the
// per-step, per-core step database: per-step time details, the input-time
// percentage summary, and the step-time breakdown.
InputPipelineAnalysisResult ComputeGenericInputPipelineAnalysisResult(
    const protobuf::RepeatedPtrField<PerCoreStepInfo>& grouped_by_step) {
  InputPipelineAnalysisResult result;

  // Computes the summary of step time in ms.
  *result.mutable_step_time_summary() =
      ComputeStepTimeSummaryInMs(grouped_by_step);

  Stat<double> input_summary_stats_in_percent;
  for (const auto& coreid_stepinfo_map : grouped_by_step) {
    // Iterates over each step.
    const auto* ptr = gtl::FindOrNull(coreid_stepinfo_map.step_info_per_core(),
                                      kDefaultGpuLocalCoreId);
    if (ptr == nullptr) {
      // For generic hardware, all step-info is put under core-0. If ptr
      // is nullptr, it means there is no step at all.
      continue;
    }
    const StepInfoResult& step_info = *ptr;
    // Adds the details for a new step.
    PerGenericStepDetails details;
    details.set_step_number(step_info.step_num());
    if (step_info.step_name().empty()) {
      // Falls back to the step number when no step name was recorded.
      details.set_step_name(absl::StrCat(step_info.step_num()));
    } else {
      details.set_step_name(step_info.step_name());
    }
    details.set_step_time_ms(PicoToMilli(step_info.duration_ps()));
    GenericStepBreakdown generic;
    bool success = step_info.step_breakdown().UnpackTo(&generic);
    // An empty type_url means an unset Any, which is tolerated; any other
    // unpack failure means the breakdown is not the generic kind.
    if (!success && !step_info.step_breakdown().type_url().empty()) {
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    const auto& type_ps = generic.type_ps();
    details.set_unknown_time_ms(GetTimeInMs(type_ps, UNKNOWN_TIME));
    details.set_host_wait_input_ms(GetTimeInMs(type_ps, HOST_WAIT_INPUT));
    // Host-to-device time also counts the device waiting on the host.
    details.set_host_to_device_ms(GetTimeInMs(type_ps, HOST_TO_DEVICE) +
                                  GetTimeInMs(type_ps, DEVICE_WAIT_HOST));
    details.set_output_ms(GetTimeInMs(type_ps, DEVICE_TO_HOST));
    // Device compute is the sum of the 16-bit and 32-bit compute events.
    details.set_device_compute_ms(GetTimeInMs(type_ps, DEVICE_COMPUTE_16) +
                                  GetTimeInMs(type_ps, DEVICE_COMPUTE_32));
    // Device-to-device time also counts the device waiting on another
    // device.
    details.set_device_to_device_ms(GetTimeInMs(type_ps, DEVICE_TO_DEVICE) +
                                    GetTimeInMs(type_ps, DEVICE_WAIT_DEVICE));
    details.set_device_collectives_ms(GetTimeInMs(type_ps, DEVICE_COLLECTIVES));
    details.set_host_compute_ms(GetTimeInMs(type_ps, HOST_COMPUTE));
    details.set_host_prepare_ms(GetTimeInMs(type_ps, HOST_PREPARE));
    details.set_host_compile_ms(GetTimeInMs(type_ps, HOST_COMPILE));
    result.add_step_details()->PackFrom(details);

    // Input time of this step as a percentage of its step time.
    const double input_percent_of_step_time =
        100.0 *
        SafeDivide(details.host_wait_input_ms() + details.host_to_device_ms(),
                   details.step_time_ms());
    input_summary_stats_in_percent.UpdateStat(input_percent_of_step_time);
  }

  // Computes the summary of input time as percentage of step time.
  *result.mutable_input_percent_summary() =
      GetStepSummaryForSampleStats(input_summary_stats_in_percent);

  // Computes the breakdown of step time.
  GenericStepTimeBreakdown generic_step_time_breakdown =
      ComputeGenericStepTimeBreakdownInMs(result);
  result.mutable_step_time_breakdown()->PackFrom(generic_step_time_breakdown);

  return result;
}
261 
// Classification of input processing on the host.
enum class InputOpCategory {
  kEnqueue,           // enqueue data to be transferred to device.
  kDemandedFileRead,  // demanded read from file.
  kAdvancedFileRead,  // advanced read from file (including cached,
                      // prefetch, parallel-map, interleave).
  kPreprocessing      // data preprocessing.
};

// Returns the human-readable name of the given input-op category.
std::string InputOpCategoryString(InputOpCategory category) {
  switch (category) {
    case InputOpCategory::kEnqueue:
      return "Enqueue";
    case InputOpCategory::kDemandedFileRead:
      return "Demanded file read";
    case InputOpCategory::kAdvancedFileRead:
      return "Advanced file read";
    case InputOpCategory::kPreprocessing:
      return "Preprocessing";
  }
  // All enumerators are handled above, but a scoped enum can legally hold
  // other values; falling off the end of a value-returning function is
  // undefined behavior, so return a sentinel instead.
  return "Unknown";
}
283 
IsInputOp(absl::string_view category)284 inline bool IsInputOp(absl::string_view category) {
285   // Do not include "IteratorGetNext*" here, because IteratorGetNext is an Op
286   // that experiences the install stall, not an Op that causes the input stall.
287   return IsInfeedEnqueueOp(category) || IsDatasetOp(category) ||
288          IsMemcpyHToDOp(category);
289 }
290 
291 // TODO(ckluk):
292 //   Confirm with the tf.data team if the classification below is correct.
CategorizeInputOp(absl::string_view name,absl::string_view category)293 InputOpCategory CategorizeInputOp(absl::string_view name,
294                                   absl::string_view category) {
295   if (IsInfeedEnqueueOp(category) || IsMemcpyHToDOp(category)) {
296     // Ops for sending input from host to device.
297     return InputOpCategory::kEnqueue;
298   }
299   DCHECK(IsDatasetOp(category));
300   if (absl::EndsWith(name, "::TFRecord") ||
301       absl::EndsWith(name, "::TextLine") ||
302       absl::EndsWith(name, "::FixedLengthRecord") ||
303       absl::EndsWith(name, "::SSTable") || absl::EndsWith(name, "::RecordIO")) {
304     // Ops that read files.
305     if (absl::StrContains(name, "::MemoryReader") ||
306         absl::StrContains(name, "::MemoryWriter") ||
307         absl::StrContains(name, "::Interleave") ||
308         absl::StrContains(name, "::Prefetch") ||
309         absl::StrContains(name, "::ParallelMap")) {
310       // Ops that read files in advance, including caching, interleaving, and
311       // prefetching.
312       return InputOpCategory::kAdvancedFileRead;
313     } else {
314       // Ops that read files on demand.
315       return InputOpCategory::kDemandedFileRead;
316     }
317   } else {
318     // All other ops are classified as preprocessing.
319     return InputOpCategory::kPreprocessing;
320   }
321 }
322 
// Input ops selected from an OpMetricsDb, together with their total time.
struct InputOpMetrics {
  // Pointers into the source OpMetricsDb for every op judged to be an
  // input op.
  std::vector<const OpMetrics*> input_op_metrics;
  // Sum of the self times (in picoseconds) of all ops in input_op_metrics.
  uint64 input_op_time_ps = 0;
};
327 
SelectInputOpMetrics(const OpMetricsDb & all_op_metrics)328 InputOpMetrics SelectInputOpMetrics(const OpMetricsDb& all_op_metrics) {
329   InputOpMetrics input_op_metrics;
330   for (const OpMetrics* op_metrics : SortedOpMetricsDb(all_op_metrics)) {
331     if (IsInputOp(op_metrics->category())) {
332       input_op_metrics.input_op_metrics.push_back(op_metrics);
333       input_op_metrics.input_op_time_ps += op_metrics->self_time_ps();
334     }
335   }
336   return input_op_metrics;
337 }
338 
ConvertOpMetricsToInputOpDetails(const OpMetrics & op_metrics,uint64 input_op_time_ps,InputOpCategory category)339 InputOpDetails ConvertOpMetricsToInputOpDetails(const OpMetrics& op_metrics,
340                                                 uint64 input_op_time_ps,
341                                                 InputOpCategory category) {
342   InputOpDetails details;
343   details.set_op_name(op_metrics.name());
344   details.set_count(op_metrics.occurrences());
345   details.set_time_in_ms(PicoToMilli(op_metrics.time_ps()));
346   details.set_self_time_in_ms(PicoToMilli(op_metrics.self_time_ps()));
347   details.set_time_in_percent(
348       100.0 * SafeDivide(op_metrics.time_ps(), input_op_time_ps));
349   details.set_self_time_in_percent(
350       100.0 * SafeDivide(op_metrics.self_time_ps(), input_op_time_ps));
351   details.set_category(InputOpCategoryString(category));
352   return details;
353 }
354 
355 // Returns the ratio of the host-to-device time in each step to the step-time.
RatioOfHostToDeviceTimeToStepTime(const OpMetricsDb & host_tf_metrics_db,const InputPipelineAnalysisResult & input_pipeline_analysis)356 double RatioOfHostToDeviceTimeToStepTime(
357     const OpMetricsDb& host_tf_metrics_db,
358     const InputPipelineAnalysisResult& input_pipeline_analysis) {
359   // For TPU execution that uses infeed.
360   absl::optional<double> host_infeed_enqueue_ratio =
361       HostInfeedEnqueueRatio(host_tf_metrics_db);
362   if (host_infeed_enqueue_ratio.has_value()) {
363     return host_infeed_enqueue_ratio.value();
364   }
365   // For GPU and TPU execution that do not use infeed.
366   double avg_step_time_ms =
367       input_pipeline_analysis.step_time_summary().average();
368   if (avg_step_time_ms > 0) {
369     // Uses the on-device step time.
370     GenericStepTimeBreakdown generic_breakdown;
371     if (input_pipeline_analysis.step_time_breakdown().UnpackTo(
372             &generic_breakdown)) {
373       double avg_host_to_device_time_ms =
374           generic_breakdown.host_to_device_ms_summary().average();
375       return SafeDivide(avg_host_to_device_time_ms, avg_step_time_ms);
376     }
377   }
378   return 0.0;
379 }
380 
DeviceCollectivesAnalysis(double device_collectives_percent,std::string * device_collectives_classification,std::string * device_collectives_statement)381 void DeviceCollectivesAnalysis(double device_collectives_percent,
382                                std::string* device_collectives_classification,
383                                std::string* device_collectives_statement) {
384   if (device_collectives_percent >=
385       kHighlyDeviceCollectivesBoundThresholdInPercent) {
386     *device_collectives_classification = "high";
387     *device_collectives_statement =
388         absl::StrCat(OneDigit(device_collectives_percent),
389                      " % of the total step time sampled is spent on 'Device "
390                      "Collective Communication'.");
391   } else if (device_collectives_percent >=
392              kModeratelyDeviceCollectivesBoundThresholdInPercent) {
393     *device_collectives_classification = "moderate";
394     *device_collectives_statement =
395         absl::StrCat(OneDigit(device_collectives_percent),
396                      " % of the total step time sampled is spent on 'Device "
397                      "Collective Communication'.");
398   } else {
399     *device_collectives_classification = "no";
400     *device_collectives_statement = "";
401   }
402 }
403 
KernelLaunchAnalysis(bool tfdata_used,double kernel_launch_percent,std::string * kernel_launch_classification,std::string * kernel_launch_statement)404 void KernelLaunchAnalysis(bool tfdata_used, double kernel_launch_percent,
405                           std::string* kernel_launch_classification,
406                           std::string* kernel_launch_statement) {
407   if (kernel_launch_percent >= kHighlyKernelLaunchBoundThresholdInPercent) {
408     *kernel_launch_classification = "high";
409     *kernel_launch_statement = absl::StrCat(
410         OneDigit(kernel_launch_percent),
411         " % of the total step time sampled is spent on 'Kernel Launch'.");
412     if (tfdata_used) {
413       absl::StrAppend(kernel_launch_statement, kKernelLaunchTfDataContention);
414     }
415   } else if (kernel_launch_percent >=
416              kModeratelyKernelLaunchBoundThresholdInPercent) {
417     *kernel_launch_classification = "moderate";
418     *kernel_launch_statement = absl::StrCat(
419         OneDigit(kernel_launch_percent),
420         " % of the total step time sampled is spent on 'Kernel Launch'.");
421     if (tfdata_used) {
422       absl::StrAppend(kernel_launch_statement, kKernelLaunchTfDataContention);
423     }
424   } else {
425     *kernel_launch_classification = "no";
426     *kernel_launch_statement = "";
427   }
428 }
429 
AllOtherAnalysis(bool all_other_reported,double all_other_percent,std::string * all_other_classification,std::string * all_other_statement)430 void AllOtherAnalysis(bool all_other_reported, double all_other_percent,
431                       std::string* all_other_classification,
432                       std::string* all_other_statement) {
433   if (all_other_reported) {
434     *all_other_classification = "no";
435     *all_other_statement = "";
436     return;
437   }
438   if (all_other_percent >= kHighlyAllOtherBoundThresholdInPercent) {
439     *all_other_classification = "high";
440     *all_other_statement =
441         absl::StrCat(OneDigit(all_other_percent), kAllOthersPythonExplanation);
442   } else if (all_other_percent >= kModeratelyAllOtherBoundThresholdInPercent) {
443     *all_other_classification = "moderate";
444     *all_other_statement =
445         absl::StrCat(OneDigit(all_other_percent), kAllOthersPythonExplanation);
446   } else {
447     *all_other_classification = "no";
448     *all_other_statement = "";
449   }
450 }
451 
452 // Tests if tf.data API is in use.
TfDataInUse(const InputTimeBreakdown & breakdown)453 bool TfDataInUse(const InputTimeBreakdown& breakdown) {
454   // Do not include enqueue_us because the "enqueue" Op that Xprof recognizes is
455   // not part of tf.data.
456   return breakdown.demanded_file_read_us() > 0 ||
457          breakdown.advanced_file_read_us() > 0 ||
458          breakdown.preprocessing_us() > 0;
459 }
460 
461 // Returns a HTML link with the given text.
MakeDocLink(absl::string_view doc_link,absl::string_view text)462 std::string MakeDocLink(absl::string_view doc_link, absl::string_view text) {
463   return absl::StrCat("<a href=\"", doc_link, "\" target=\"_blank\">", text,
464                       "</a>");
465 }
466 
// Returns the URL of the introduction to the tf.data API.
std::string DatasetIntroDoc() {
  static const char* const kUrl = "https://www.tensorflow.org/guide/data";
  return kUrl;
}
471 
472 }  // namespace
473 
// Populates `result` with the host-side input analysis: per-op details for
// every input op found in `host_tf_metrics_db`, plus a breakdown of where
// input time is spent (enqueue, file reads, preprocessing). No-op when no
// input ops are found.
void GenerateHostResult(const OpMetricsDb& host_tf_metrics_db,
                        InputPipelineAnalysisResult* result) {
  InputOpMetrics input_op_metrics = SelectInputOpMetrics(host_tf_metrics_db);
  // Returns if the program is not using an input pipeline with
  // instrumentation and hence no input ops are found.
  if (input_op_metrics.input_op_metrics.empty()) return;

  // Total self time (in microseconds) per input-op category.
  absl::flat_hash_map<InputOpCategory, double> aggregated_input_op_times_us;
  for (const OpMetrics* op_metrics : input_op_metrics.input_op_metrics) {
    InputOpCategory category =
        CategorizeInputOp(op_metrics->name(), op_metrics->category());
    *result->add_input_op_details() = ConvertOpMetricsToInputOpDetails(
        *op_metrics, input_op_metrics.input_op_time_ps, category);
    aggregated_input_op_times_us[category] +=
        PicoToMicro(op_metrics->self_time_ps());
  }

  double enqueue_time_us =
      aggregated_input_op_times_us[InputOpCategory::kEnqueue];
  // Measured total over the non-enqueue categories.
  double total_input_op_time_us =
      aggregated_input_op_times_us[InputOpCategory::kDemandedFileRead] +
      aggregated_input_op_times_us[InputOpCategory::kAdvancedFileRead] +
      aggregated_input_op_times_us[InputOpCategory::kPreprocessing];

  // Fraction of the step spent on host-to-device transfer, clamped to 1.
  double ratio = std::min(
      1.0, RatioOfHostToDeviceTimeToStepTime(host_tf_metrics_db, *result));
  DCHECK_GE(ratio, 0.0);
  // Estimates the non-enqueue input time from the enqueue time: if enqueue
  // accounts for `ratio` of the step, the rest of the step (and hence the
  // non-enqueue input work, presumably overlapped with it) scales by
  // (1 - ratio) / ratio. When the ratio is unknown (0), falls back to the
  // measured total. NOTE(review): this overlap assumption is implicit —
  // confirm with the profiler team.
  double non_enqueue_time_us = (ratio != 0.0)
                                   ? (enqueue_time_us * (1.0 - ratio) / ratio)
                                   : total_input_op_time_us;

  // Scales the various input-time components wrt to non_enqueue_time_us,
  // preserving their measured proportions.
  double scaled_demanded_fileread_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kDemandedFileRead],
      total_input_op_time_us);
  double scaled_advanced_fileread_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kAdvancedFileRead],
      total_input_op_time_us);
  double scaled_preprocessing_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kPreprocessing],
      total_input_op_time_us);
  // Remainder of the estimate not covered by the scaled categories (clamped
  // at zero).
  double unclassified_non_enqueue_time_us = std::max(
      0.0, non_enqueue_time_us - scaled_demanded_fileread_time_us -
               scaled_advanced_fileread_time_us - scaled_preprocessing_time_us);

  InputTimeBreakdown* input_time_breakdown =
      result->mutable_input_time_breakdown();
  input_time_breakdown->set_enqueue_us(enqueue_time_us);
  input_time_breakdown->set_demanded_file_read_us(
      scaled_demanded_fileread_time_us);
  input_time_breakdown->set_advanced_file_read_us(
      scaled_advanced_fileread_time_us);
  input_time_breakdown->set_preprocessing_us(scaled_preprocessing_time_us);
  input_time_breakdown->set_unclassified_non_enqueue_us(
      unclassified_non_enqueue_time_us);
}
533 
// Builds the static list of recommendation details shown alongside the
// analysis, one per area of input processing, each with HTML links into the
// tf.data documentation.
InputPipelineAnalysisRecommendation GenerateRecommendation() {
  const absl::string_view kDatasetIntro =
      "https://www.tensorflow.org/programmers_guide/datasets";

  const absl::string_view kDatasetTopic =
      "https://www.tensorflow.org/api_docs/python/tf/data/Dataset#";

  const absl::string_view kTfRecordDataset =
      "https://www.tensorflow.org/api_docs/python/tf/data/"
      "TFRecordDataset#class_tfrecorddataset";

  InputPipelineAnalysisRecommendation recommendation;
  // Advice about enqueuing data.
  *recommendation.add_details() =
      "Enqueuing data: you may want to combine small input data chunks "
      "into fewer "
      "but larger chunks.";
  // Advice about data preprocessing.
  *recommendation.add_details() = absl::StrCat(
      "Data preprocessing: you may increase num_parallel_calls in ",
      AnchorElement(absl::StrCat(kDatasetTopic, "map"), "Dataset map()"),
      " or preprocess the data OFFLINE.");
  // Advice about reading data from files in advance.
  *recommendation.add_details() = absl::StrCat(
      "Reading data from files in advance: you may tune parameters in the "
      "following tf.data API (",
      AnchorElement(absl::StrCat(kDatasetTopic, "prefetch"), "prefetch size"),
      ", ",
      AnchorElement(absl::StrCat(kDatasetTopic, "interleave"),
                    "interleave cycle_length"),
      ", ", AnchorElement(kTfRecordDataset, "reader buffer_size"), ")");
  // Advice about reading data from files on demand.
  *recommendation.add_details() = absl::StrCat(
      "Reading data from files on demand: you should read data IN ADVANCE "
      "using the following tf.data API (",
      AnchorElement(absl::StrCat(kDatasetTopic, "prefetch"), "prefetch"), ", ",
      AnchorElement(absl::StrCat(kDatasetTopic, "interleave"), "interleave"),
      ", ", AnchorElement(kTfRecordDataset, "reader buffer"), ")");
  // Catch-all advice for other data reading or processing.
  *recommendation.add_details() = absl::StrCat(
      "Other data reading or processing: you may consider using the ",
      AnchorElement(kDatasetIntro, "tf.data API"),
      " (if you are not using it now)");

  return recommendation;
}
575 
ComputeStepTimeSummaryInMs(const protobuf::RepeatedPtrField<PerCoreStepInfo> & grouped_by_step)576 StepSummary ComputeStepTimeSummaryInMs(
577     const protobuf::RepeatedPtrField<PerCoreStepInfo>& grouped_by_step) {
578   Stat<double> total_step_stats_in_ms;
579   // iterates over each step.
580   for (const auto& coreid_stepinfo_map : grouped_by_step) {
581     double max_per_step_stats_in_ms = 0.0;
582     // iterates over each core.
583     for (const auto& coreid_and_stepinfo :
584          coreid_stepinfo_map.step_info_per_core()) {
585       const auto& step_info = coreid_and_stepinfo.second;
586       max_per_step_stats_in_ms = std::max(step_info.duration_ps() / kNumPsPerMs,
587                                           max_per_step_stats_in_ms);
588     }
589     // Step time of each step is determined by the slowest core.
590     total_step_stats_in_ms.UpdateStat(max_per_step_stats_in_ms);
591   }
592 
593   return GetStepSummaryForSampleStats(total_step_stats_in_ms);
594 }
595 
// Public entry point: converts the collected OpStats into the full
// input-pipeline analysis — per-step details, diagnostics, hardware type,
// host-side breakdown, bottleneck analysis, and recommendations.
InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis(
    const OpStats& op_stats) {
  InputPipelineAnalysisResult result =
      ComputeGenericInputPipelineAnalysisResult(
          op_stats.step_db().step_sequence());
  PopulateStepDiagnostics(op_stats, result.mutable_diagnostics());
  result.set_hardware_type(HardwareType_Name(
      ParseHardwareType(op_stats.run_environment().device_type())));
  // Host-side analysis must run before the bottleneck analysis, which reads
  // result.input_time_breakdown().
  GenerateHostResult(op_stats.host_op_metrics_db(), &result);

  InputPipelineAnalysisRecommendation recommendation = GenerateRecommendation();
  BottleneckAnalysis bottleneck_analysis = ComputeBottleneckAnalysis(
      result.input_time_breakdown(), result.step_details());
  result.set_input_percent(bottleneck_analysis.input_percent());
  result.set_output_percent(bottleneck_analysis.output_percent());
  result.set_idle_percent(bottleneck_analysis.idle_percent());
  result.set_compute_percent(bottleneck_analysis.compute_percent());

  recommendation.mutable_bottleneck_analysis()->PackFrom(bottleneck_analysis);
  *recommendation.mutable_summary_next_step() =
      GetSummaryNextStep(bottleneck_analysis.input_classification(),
                         result.input_time_breakdown());

  *result.mutable_recommendation() = recommendation;
  return result;
}
622 
// Classifies how input-bound the program is. Sets `input_classification` to
// "host", "both", or "device" and fills `input_statement` with an
// explanation. Returns true iff the potentially-input-bound conclusion came
// from significant "All Others" time rather than measured input time.
bool InputAnalysis(double input_percent, double all_other_percent,
                   std::string* input_classification,
                   std::string* input_statement) {
  absl::string_view non_input_time = "other time";
  if (input_percent >= kHighlyInfeedBoundThresholdInPercent) {
    *input_classification = "host";
    *input_statement = absl::StrCat(
        "Your program is HIGHLY input-bound because ", OneDigit(input_percent),
        "% of the total step time sampled is waiting for input. Therefore, you "
        "should first focus on reducing the input time.");
    return false;
  } else if (input_percent >= kModeratelyInfeedBoundThresholdInPercent) {
    *input_classification = "both";
    *input_statement = absl::StrCat(
        "Your program is MODERATELY input-bound because ",
        OneDigit(input_percent),
        "% of the total step time sampled is waiting for input. Therefore, "
        "you would need to reduce both the input time and ",
        non_input_time, ".");
    return false;
  } else if (all_other_percent >= kModeratelyAllOtherBoundThresholdInPercent) {
    // Input analysis says it is not input-bound, but "All-Other" time
    // is significant. It could still be input-bound (or Python overhead).
    *input_classification = "both";
    *input_statement = absl::StrCat(
        "Your program is POTENTIALLY input-bound because ",
        OneDigit(all_other_percent),
        "% of the total step time sampled is spent on 'All Others' time (which "
        "could be due to I/O or Python execution or both).");
    return true;
  } else {
    // Definitely not input-bound.
    *input_classification = "device";
    *input_statement =
        absl::StrCat("Your program is NOT input-bound because only ",
                     OneDigit(input_percent),
                     "% of the total step time sampled is waiting for "
                     "input. Therefore, you should focus on "
                     "reducing ",
                     non_input_time, ".");
    return false;
  }
}
666 
OutputAnalysis(double output_percent,std::string * output_classification,std::string * output_statement)667 void OutputAnalysis(double output_percent, std::string* output_classification,
668                     std::string* output_statement) {
669   if (output_percent >= kHighlyOutfeedBoundThresholdInPercent) {
670     *output_classification = "host";
671     *output_statement = absl::StrCat(
672         "Your program is HIGHLY output-bound because ",
673         OneDigit(output_percent),
674         "% of the total step time sampled is spent on output. Therefore, you "
675         "should first focus on reducing the output time.");
676   } else if (output_percent >= kModeratelyOutfeedBoundThresholdInPercent) {
677     *output_classification = "both";
678     *output_statement = absl::StrCat(
679         "Your program is MODERATELY output-bound because ",
680         OneDigit(output_percent),
681         "% of the total step time sampled is spent on output. Therefore, "
682         "you would need to reduce both the output time and other time.");
683   } else {
684     *output_classification = "device";
685     *output_statement = "";
686   }
687 }
688 
ComputeBottleneckAnalysis(const InputTimeBreakdown & input_time_breakdown,const::tensorflow::protobuf::RepeatedPtrField<::google::protobuf::Any> & any_step_details)689 BottleneckAnalysis ComputeBottleneckAnalysis(
690     const InputTimeBreakdown& input_time_breakdown,
691     const ::tensorflow::protobuf::RepeatedPtrField<::google::protobuf::Any>&
692         any_step_details) {
693   double total_step_time_ms = 0;
694   double total_input_ms = 0;
695   double total_output_ms = 0;
696   double total_host_compute_ms = 0;
697   double total_host_prepare_ms = 0;
698   double total_host_compile_ms = 0;
699   double total_device_compute_ms = 0;
700   double total_device_to_device_ms = 0;
701   double total_device_collectives_ms = 0;
702   double total_unknown_ms = 0;
703 
704   for (const google::protobuf::Any& step_details : any_step_details) {
705     PerGenericStepDetails details;
706     bool success = step_details.UnpackTo(&details);
707     if (!success && !step_details.type_url().empty()) {
708       LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
709                  << std::endl;
710       return {};
711     }
712     total_step_time_ms += details.step_time_ms();
713     total_input_ms +=
714         details.host_wait_input_ms() + details.host_to_device_ms();
715     total_output_ms += details.output_ms();
716     total_host_prepare_ms += details.host_prepare_ms();
717     total_device_compute_ms += details.device_compute_ms();
718     total_device_to_device_ms += details.device_to_device_ms();
719     total_device_collectives_ms += details.device_collectives_ms();
720     total_host_compute_ms += details.host_compute_ms();
721     total_host_compile_ms += details.host_compile_ms();
722     total_unknown_ms += details.unknown_time_ms();
723   }
724 
725   if (total_step_time_ms == 0) {
726     BottleneckAnalysis analysis;
727     analysis.set_input_classification("unknown");
728     analysis.set_input_statement(
729         "No step time measured. Therefore we cannot tell where the "
730         "performance bottleneck is.");
731     analysis.set_kernel_launch_classification("no");
732     analysis.set_kernel_launch_statement("");
733     analysis.set_all_other_classification("no");
734     analysis.set_all_other_statement("");
735     analysis.set_device_collectives_classification("no");
736     analysis.set_device_collectives_statement("");
737     return analysis;
738   }
739   double input_percent = 100.0 * total_input_ms / total_step_time_ms;
740   double output_percent = 100.0 * total_output_ms / total_step_time_ms;
741   double compute_percent = 100.0 * total_device_compute_ms / total_step_time_ms;
742   double device_collectives_percent =
743       100.0 * total_device_collectives_ms / total_step_time_ms;
744 
745   // idle_percent includes host_prepare (i.e. kernel launch, device-to-device,
746   // host compute, host compile, and unknown.
747   double idle_percent =
748       std::max(0.0, 100.0 - input_percent - output_percent - compute_percent -
749                         device_collectives_percent);
750   double kernel_launch_percent =
751       100.0 * total_host_prepare_ms / total_step_time_ms;
752   double all_other_percent = 100.0 * total_unknown_ms / total_step_time_ms;
753 
754   std::string input_classification;
755   std::string input_statement;
756   bool all_other_reported =
757       InputAnalysis(input_percent, all_other_percent, &input_classification,
758                     &input_statement);
759 
760   std::string device_collectives_classification;
761   std::string device_collectives_statement;
762   DeviceCollectivesAnalysis(device_collectives_percent,
763                             &device_collectives_classification,
764                             &device_collectives_statement);
765 
766   std::string kernel_launch_classification;
767   std::string kernel_launch_statement;
768   KernelLaunchAnalysis(TfDataInUse(input_time_breakdown), kernel_launch_percent,
769                        &kernel_launch_classification, &kernel_launch_statement);
770 
771   std::string all_other_classification;
772   std::string all_other_statement;
773   AllOtherAnalysis(all_other_reported, all_other_percent,
774                    &all_other_classification, &all_other_statement);
775 
776   BottleneckAnalysis analysis;
777   analysis.set_input_percent(input_percent);
778   analysis.set_output_percent(output_percent);
779   analysis.set_idle_percent(idle_percent);
780   analysis.set_compute_percent(compute_percent);
781 
782   analysis.set_input_classification(input_classification);
783   analysis.set_input_statement(input_statement);
784   analysis.set_kernel_launch_classification(kernel_launch_classification);
785   analysis.set_kernel_launch_statement(kernel_launch_statement);
786   analysis.set_all_other_classification(all_other_classification);
787   analysis.set_all_other_statement(all_other_statement);
788   analysis.set_device_collectives_classification(
789       device_collectives_classification);
790   analysis.set_device_collectives_statement(device_collectives_statement);
791 
792   return analysis;
793 }
794 
GetSummaryNextStep(absl::string_view input_classification,const InputTimeBreakdown & breakdown)795 std::string GetSummaryNextStep(absl::string_view input_classification,
796                                const InputTimeBreakdown& breakdown) {
797   std::string summary_next_step;
798   if (input_classification == "host" || input_classification == "both") {
799     if (!TfDataInUse(breakdown)) {
800       summary_next_step = absl::StrCat(
801           "Consider using ", MakeDocLink(DatasetIntroDoc(), "the tf.data API"),
802           " to enable profiler's host-side analysis for input pipeline. "
803           "Profiler currently does not support custom input pipeline (please "
804           "ignore "
805           "Section ",
806           kHostAnalysisSectionNumber, " below).");
807     } else {
808       summary_next_step =
809           absl::StrCat("Look at Section ", kHostAnalysisSectionNumber,
810                        " for the breakdown of input time on the host.");
811     }
812   } else {
813     summary_next_step = "You may skip the rest of this page.";
814   }
815 
816   return summary_next_step;
817 }
818 
HostToDeviceTransferAsPercentOfInputTime(const InputTimeBreakdown & breakdown)819 double HostToDeviceTransferAsPercentOfInputTime(
820     const InputTimeBreakdown& breakdown) {
821   // Thanks to the scaling trick we did in GenerateHostResult(), we can
822   // estimate the percentage of input-time spent on host-to-device transfer in
823   // the following way.
824   double total_input_time_us =
825       breakdown.demanded_file_read_us() + breakdown.advanced_file_read_us() +
826       breakdown.preprocessing_us() + breakdown.enqueue_us() +
827       breakdown.unclassified_non_enqueue_us();
828   return 100.0 * SafeDivide(breakdown.enqueue_us(), total_input_time_us);
829 }
830 
831 }  // namespace profiler
832 }  // namespace tensorflow
833