1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h"
17
18 #include <math.h>
19
20 #include <algorithm>
21 #include <string>
22 #include <vector>
23
24 #include "google/protobuf/any.pb.h"
25 #include "absl/container/flat_hash_map.h"
26 #include "absl/strings/match.h"
27 #include "absl/strings/str_cat.h"
28 #include "absl/strings/string_view.h"
29 #include "tensorflow/core/lib/gtl/map_util.h"
30 #include "tensorflow/core/platform/logging.h"
31 #include "tensorflow/core/platform/types.h"
32 #include "tensorflow/core/profiler/convert/op_metrics_to_record.h"
33 #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
34 #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
35 #include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h"
36 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
37 #include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
38 #include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
39 #include "tensorflow/core/profiler/utils/diagnostics.h"
40 #include "tensorflow/core/profiler/utils/event_span.h"
41 #include "tensorflow/core/profiler/utils/format_utils.h"
42 #include "tensorflow/core/profiler/utils/hardware_type_utils.h"
43 #include "tensorflow/core/profiler/utils/html_utils.h"
44 #include "tensorflow/core/profiler/utils/math_utils.h"
45 #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h"
46 #include "tensorflow/core/profiler/utils/tf_op_utils.h"
47 #include "tensorflow/core/util/stats_calculator.h"
48
49 namespace tensorflow {
50 namespace profiler {
51
52 namespace {
53
54 const double kNumPsPerMs = 1000000000.0;
55
56 // If the percentage of step time that is due to infeed is less than
57 // kModeratelyInfeedBoundThresholdInPercent, it is considered NOT
58 // input-bound; else if it is less than
59 // kHighlyInfeedBoundThresholdInPercent, it is considered MODERATELY
// input-bound; otherwise it is considered HIGHLY input-bound.
61 constexpr double kModeratelyInfeedBoundThresholdInPercent = 5;
62 constexpr double kHighlyInfeedBoundThresholdInPercent = 20;
63
64 // If the percentage of step time that is due to outfeed is less than
65 // kModeratelyOutfeedBoundThresholdInPercent, it is considered NOT
66 // output-bound; else if it is less than
67 // kHighlyOutfeedBoundThresholdInPercent, it is considered MODERATELY
// output-bound; otherwise it is considered HIGHLY output-bound.
69 constexpr double kModeratelyOutfeedBoundThresholdInPercent = 5;
70 constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;
71
72 // If the percentage of step time that is due to kernel launch is less than
73 // kModeratelyKernelLaunchBoundThresholdInPercent, it is considered NOT
74 // kernel-launch bound; else if it is less than
75 // kHighlyKernelLaunchBoundThresholdInPercent, it is considered MODERATELY
// kernel-launch bound; otherwise it is considered HIGHLY kernel-launch bound.
77 constexpr double kModeratelyKernelLaunchBoundThresholdInPercent = 3;
78 constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;
79
80 // If the percentage of step time that is due to all other time is less than
81 // kModeratelyAllOtherBoundThresholdInPercent, it is considered NOT
82 // all-other bound; else if it is less than
83 // kHighlyAllOtherBoundThresholdInPercent, it is considered MODERATELY
// all-other bound; otherwise it is considered HIGHLY all-other bound.
85 constexpr double kModeratelyAllOtherBoundThresholdInPercent = 3;
86 constexpr double kHighlyAllOtherBoundThresholdInPercent = 15;
87
88 // If the percentage of step time that is due to device collectives is less than
89 // kModeratelyDeviceCollectivesBoundThresholdInPercent, it is considered NOT
90 // device-collectives bound; else if it is less than
91 // kHighlyDeviceCollectivesBoundThresholdInPercent, it is considered MODERATELY
// device-collectives bound; otherwise it is considered HIGHLY
// device-collectives bound.
94 constexpr double kModeratelyDeviceCollectivesBoundThresholdInPercent = 3;
95 constexpr double kHighlyDeviceCollectivesBoundThresholdInPercent = 15;
96
97 // Section number of the host-analysis section in the input-pipeline analysis.
98 constexpr int kHostAnalysisSectionNumber = 3;
99 // Python-only explanation for "All Others" time.
100 const char* kAllOthersPythonExplanation =
101 " % of the total step time sampled is spent on 'All Others' time. "
102 "This could be due to Python execution overhead.";
103 // Explanation for "Kernel Launch" time due to CPU contention with tf.data.
104 const char* kKernelLaunchTfDataContention =
105 " It could be due to CPU contention with tf.data. In this case, you may "
106 "try to set the environment variable TF_GPU_THREAD_MODE=gpu_private.";
107
108 template <class Collection>
GetTimeInMs(const Collection & type_ps,EventType event_type)109 double GetTimeInMs(const Collection& type_ps, EventType event_type) {
110 return PicoToMilli(gtl::FindWithDefault(type_ps, event_type, /*value=*/0));
111 }
112
GetStepSummaryForSampleStats(const Stat<double> & sample_stats)113 StepSummary GetStepSummaryForSampleStats(const Stat<double>& sample_stats) {
114 StepSummary step_time_summary;
115 double avg, sdv, min, max;
116 if (sample_stats.empty()) {
117 // If sample_stats is empty, sample_stats.avg() will return NaN. However, we
118 // prefer to show an 0 instead.
119 avg = sdv = min = max = 0.0;
120 } else {
121 avg = sample_stats.avg();
122 sdv = sqrt(sample_stats.sample_variance());
123 min = sample_stats.min();
124 max = sample_stats.max();
125 }
126 step_time_summary.set_average(avg);
127 step_time_summary.set_standard_deviation(sdv);
128 step_time_summary.set_minimum(min);
129 step_time_summary.set_maximum(max);
130 return step_time_summary;
131 }
132
// Aggregates the per-step time components stored in `analysis.step_details()`
// (each entry is a PerGenericStepDetails packed into a google::protobuf::Any)
// into per-component StepSummary statistics (avg/stddev/min/max, all in ms).
// Returns an empty breakdown if any entry fails to unpack as generic details.
GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
    const InputPipelineAnalysisResult& analysis) {
  Stat<double> unknown_time_ms;
  Stat<double> host_wait_input_ms;
  Stat<double> host_to_device_ms;
  Stat<double> input_ms;
  Stat<double> output_ms;
  Stat<double> device_compute_ms;
  Stat<double> device_to_device_ms;
  Stat<double> device_collectives_ms;
  Stat<double> host_compute_ms;
  Stat<double> host_prepare_ms;
  Stat<double> host_compile_ms;
  GenericStepTimeBreakdown result;

  for (const google::protobuf::Any& step_details : analysis.step_details()) {
    PerGenericStepDetails details;
    bool success = step_details.UnpackTo(&details);
    if (!success && !step_details.type_url().empty()) {
      // A non-empty type_url that fails to unpack means the step details are
      // not of the generic kind; bail out with an empty breakdown.
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    unknown_time_ms.UpdateStat(details.unknown_time_ms());
    host_wait_input_ms.UpdateStat(details.host_wait_input_ms());
    host_to_device_ms.UpdateStat(details.host_to_device_ms());
    // Total input time is the host-side input wait plus the host-to-device
    // transfer time.
    input_ms.UpdateStat(details.host_wait_input_ms() +
                        details.host_to_device_ms());
    output_ms.UpdateStat(details.output_ms());
    device_compute_ms.UpdateStat(details.device_compute_ms());
    device_to_device_ms.UpdateStat(details.device_to_device_ms());
    device_collectives_ms.UpdateStat(details.device_collectives_ms());
    host_compute_ms.UpdateStat(details.host_compute_ms());
    host_prepare_ms.UpdateStat(details.host_prepare_ms());
    host_compile_ms.UpdateStat(details.host_compile_ms());
  }
  *result.mutable_unknown_time_ms_summary() =
      GetStepSummaryForSampleStats(unknown_time_ms);
  *result.mutable_host_wait_input_ms_summary() =
      GetStepSummaryForSampleStats(host_wait_input_ms);
  *result.mutable_host_to_device_ms_summary() =
      GetStepSummaryForSampleStats(host_to_device_ms);
  *result.mutable_input_ms_summary() = GetStepSummaryForSampleStats(input_ms);
  *result.mutable_output_ms_summary() = GetStepSummaryForSampleStats(output_ms);
  *result.mutable_device_compute_ms_summary() =
      GetStepSummaryForSampleStats(device_compute_ms);
  *result.mutable_device_to_device_ms_summary() =
      GetStepSummaryForSampleStats(device_to_device_ms);
  *result.mutable_device_collectives_ms_summary() =
      GetStepSummaryForSampleStats(device_collectives_ms);
  *result.mutable_host_compute_ms_summary() =
      GetStepSummaryForSampleStats(host_compute_ms);
  *result.mutable_host_prepare_ms_summary() =
      GetStepSummaryForSampleStats(host_prepare_ms);
  *result.mutable_host_compile_ms_summary() =
      GetStepSummaryForSampleStats(host_compile_ms);
  return result;
}
191
// Builds the generic (non-TPU-specific) input-pipeline analysis from the
// per-step, per-core step database: per-step breakdown details, the step-time
// summary, the input-percentage summary, and the step-time breakdown.
InputPipelineAnalysisResult ComputeGenericInputPipelineAnalysisResult(
    const protobuf::RepeatedPtrField<PerCoreStepInfo>& grouped_by_step) {
  InputPipelineAnalysisResult result;

  // Computes the summary of step time in ms.
  *result.mutable_step_time_summary() =
      ComputeStepTimeSummaryInMs(grouped_by_step);

  Stat<double> input_summary_stats_in_percent;
  for (const auto& coreid_stepinfo_map : grouped_by_step) {
    // Iterates over each step.
    const auto* ptr = gtl::FindOrNull(coreid_stepinfo_map.step_info_per_core(),
                                      kDefaultGpuLocalCoreId);
    if (ptr == nullptr) {
      // For generic hardware, all step-info is put under core-0. If ptr
      // is nullptr, it means there is no step at all.
      continue;
    }
    const StepInfoResult& step_info = *ptr;
    // Adds the details for a new step.
    PerGenericStepDetails details;
    details.set_step_number(step_info.step_num());
    if (step_info.step_name().empty()) {
      // Falls back to the step number when no step name was recorded.
      details.set_step_name(absl::StrCat(step_info.step_num()));
    } else {
      details.set_step_name(step_info.step_name());
    }
    details.set_step_time_ms(PicoToMilli(step_info.duration_ps()));
    GenericStepBreakdown generic;
    bool success = step_info.step_breakdown().UnpackTo(&generic);
    if (!success && !step_info.step_breakdown().type_url().empty()) {
      // A non-empty type_url that fails to unpack means this is not a generic
      // step breakdown; bail out with an empty result.
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    const auto& type_ps = generic.type_ps();
    details.set_unknown_time_ms(GetTimeInMs(type_ps, UNKNOWN_TIME));
    details.set_host_wait_input_ms(GetTimeInMs(type_ps, HOST_WAIT_INPUT));
    // Host-to-device time also counts time the device spends waiting for the
    // host.
    details.set_host_to_device_ms(GetTimeInMs(type_ps, HOST_TO_DEVICE) +
                                  GetTimeInMs(type_ps, DEVICE_WAIT_HOST));
    details.set_output_ms(GetTimeInMs(type_ps, DEVICE_TO_HOST));
    // Device compute is recorded separately per precision (16- and 32-bit).
    details.set_device_compute_ms(GetTimeInMs(type_ps, DEVICE_COMPUTE_16) +
                                  GetTimeInMs(type_ps, DEVICE_COMPUTE_32));
    details.set_device_to_device_ms(GetTimeInMs(type_ps, DEVICE_TO_DEVICE) +
                                    GetTimeInMs(type_ps, DEVICE_WAIT_DEVICE));
    details.set_device_collectives_ms(GetTimeInMs(type_ps, DEVICE_COLLECTIVES));
    details.set_host_compute_ms(GetTimeInMs(type_ps, HOST_COMPUTE));
    details.set_host_prepare_ms(GetTimeInMs(type_ps, HOST_PREPARE));
    details.set_host_compile_ms(GetTimeInMs(type_ps, HOST_COMPILE));
    result.add_step_details()->PackFrom(details);

    // Input time as a percentage of this step's total time.
    const double input_percent_of_step_time =
        100.0 *
        SafeDivide(details.host_wait_input_ms() + details.host_to_device_ms(),
                   details.step_time_ms());
    input_summary_stats_in_percent.UpdateStat(input_percent_of_step_time);
  }

  // Computes the summary of input time as percentage of step time.
  *result.mutable_input_percent_summary() =
      GetStepSummaryForSampleStats(input_summary_stats_in_percent);

  // Computes the breakdown of step time.
  GenericStepTimeBreakdown generic_step_time_breakdown =
      ComputeGenericStepTimeBreakdownInMs(result);
  result.mutable_step_time_breakdown()->PackFrom(generic_step_time_breakdown);

  return result;
}
261
// Classification of input processing on the host.
enum class InputOpCategory {
  kEnqueue,           // enqueue data to be transferred to device.
  kDemandedFileRead,  // demanded read from file.
  kAdvancedFileRead,  // advanced read from file (including cached,
                      // prefetch, parallel-map, interleave).
  kPreprocessing      // data preprocessing.
};

// Returns the human-readable display name for an InputOpCategory.
std::string InputOpCategoryString(InputOpCategory category) {
  switch (category) {
    case InputOpCategory::kEnqueue:
      return "Enqueue";
    case InputOpCategory::kDemandedFileRead:
      return "Demanded file read";
    case InputOpCategory::kAdvancedFileRead:
      return "Advanced file read";
    case InputOpCategory::kPreprocessing:
      return "Preprocessing";
  }
  // Unreachable for valid enumerators (no default case above so the compiler
  // can warn on a newly added enumerator), but flowing off the end of a
  // value-returning function is undefined behavior, so return explicitly.
  return "Unknown";
}
283
IsInputOp(absl::string_view category)284 inline bool IsInputOp(absl::string_view category) {
285 // Do not include "IteratorGetNext*" here, because IteratorGetNext is an Op
286 // that experiences the install stall, not an Op that causes the input stall.
287 return IsInfeedEnqueueOp(category) || IsDatasetOp(category) ||
288 IsMemcpyHToDOp(category);
289 }
290
291 // TODO(ckluk):
292 // Confirm with the tf.data team if the classification below is correct.
CategorizeInputOp(absl::string_view name,absl::string_view category)293 InputOpCategory CategorizeInputOp(absl::string_view name,
294 absl::string_view category) {
295 if (IsInfeedEnqueueOp(category) || IsMemcpyHToDOp(category)) {
296 // Ops for sending input from host to device.
297 return InputOpCategory::kEnqueue;
298 }
299 DCHECK(IsDatasetOp(category));
300 if (absl::EndsWith(name, "::TFRecord") ||
301 absl::EndsWith(name, "::TextLine") ||
302 absl::EndsWith(name, "::FixedLengthRecord") ||
303 absl::EndsWith(name, "::SSTable") || absl::EndsWith(name, "::RecordIO")) {
304 // Ops that read files.
305 if (absl::StrContains(name, "::MemoryReader") ||
306 absl::StrContains(name, "::MemoryWriter") ||
307 absl::StrContains(name, "::Interleave") ||
308 absl::StrContains(name, "::Prefetch") ||
309 absl::StrContains(name, "::ParallelMap")) {
310 // Ops that read files in advance, including caching, interleaving, and
311 // prefetching.
312 return InputOpCategory::kAdvancedFileRead;
313 } else {
314 // Ops that read files on demand.
315 return InputOpCategory::kDemandedFileRead;
316 }
317 } else {
318 // All other ops are classified as preprocessing.
319 return InputOpCategory::kPreprocessing;
320 }
321 }
322
// The op metrics of all ops classified as input ops (see IsInputOp), plus
// the sum of their self times.
struct InputOpMetrics {
  std::vector<const OpMetrics*> input_op_metrics;  // not owned.
  uint64 input_op_time_ps = 0;  // total self time of the ops above.
};
327
SelectInputOpMetrics(const OpMetricsDb & all_op_metrics)328 InputOpMetrics SelectInputOpMetrics(const OpMetricsDb& all_op_metrics) {
329 InputOpMetrics input_op_metrics;
330 for (const OpMetrics* op_metrics : SortedOpMetricsDb(all_op_metrics)) {
331 if (IsInputOp(op_metrics->category())) {
332 input_op_metrics.input_op_metrics.push_back(op_metrics);
333 input_op_metrics.input_op_time_ps += op_metrics->self_time_ps();
334 }
335 }
336 return input_op_metrics;
337 }
338
ConvertOpMetricsToInputOpDetails(const OpMetrics & op_metrics,uint64 input_op_time_ps,InputOpCategory category)339 InputOpDetails ConvertOpMetricsToInputOpDetails(const OpMetrics& op_metrics,
340 uint64 input_op_time_ps,
341 InputOpCategory category) {
342 InputOpDetails details;
343 details.set_op_name(op_metrics.name());
344 details.set_count(op_metrics.occurrences());
345 details.set_time_in_ms(PicoToMilli(op_metrics.time_ps()));
346 details.set_self_time_in_ms(PicoToMilli(op_metrics.self_time_ps()));
347 details.set_time_in_percent(
348 100.0 * SafeDivide(op_metrics.time_ps(), input_op_time_ps));
349 details.set_self_time_in_percent(
350 100.0 * SafeDivide(op_metrics.self_time_ps(), input_op_time_ps));
351 details.set_category(InputOpCategoryString(category));
352 return details;
353 }
354
// Returns the ratio of the host-to-device time in each step to the step-time.
// Falls back to 0.0 when neither an infeed ratio nor a generic step-time
// breakdown is available.
double RatioOfHostToDeviceTimeToStepTime(
    const OpMetricsDb& host_tf_metrics_db,
    const InputPipelineAnalysisResult& input_pipeline_analysis) {
  // For TPU execution that uses infeed.
  absl::optional<double> host_infeed_enqueue_ratio =
      HostInfeedEnqueueRatio(host_tf_metrics_db);
  if (host_infeed_enqueue_ratio.has_value()) {
    return host_infeed_enqueue_ratio.value();
  }
  // For GPU and TPU execution that do not use infeed.
  double avg_step_time_ms =
      input_pipeline_analysis.step_time_summary().average();
  if (avg_step_time_ms > 0) {
    // Uses the on-device step time.
    GenericStepTimeBreakdown generic_breakdown;
    if (input_pipeline_analysis.step_time_breakdown().UnpackTo(
            &generic_breakdown)) {
      double avg_host_to_device_time_ms =
          generic_breakdown.host_to_device_ms_summary().average();
      // Average host-to-device time as a fraction of the average step time.
      return SafeDivide(avg_host_to_device_time_ms, avg_step_time_ms);
    }
  }
  return 0.0;
}
380
DeviceCollectivesAnalysis(double device_collectives_percent,std::string * device_collectives_classification,std::string * device_collectives_statement)381 void DeviceCollectivesAnalysis(double device_collectives_percent,
382 std::string* device_collectives_classification,
383 std::string* device_collectives_statement) {
384 if (device_collectives_percent >=
385 kHighlyDeviceCollectivesBoundThresholdInPercent) {
386 *device_collectives_classification = "high";
387 *device_collectives_statement =
388 absl::StrCat(OneDigit(device_collectives_percent),
389 " % of the total step time sampled is spent on 'Device "
390 "Collective Communication'.");
391 } else if (device_collectives_percent >=
392 kModeratelyDeviceCollectivesBoundThresholdInPercent) {
393 *device_collectives_classification = "moderate";
394 *device_collectives_statement =
395 absl::StrCat(OneDigit(device_collectives_percent),
396 " % of the total step time sampled is spent on 'Device "
397 "Collective Communication'.");
398 } else {
399 *device_collectives_classification = "no";
400 *device_collectives_statement = "";
401 }
402 }
403
KernelLaunchAnalysis(bool tfdata_used,double kernel_launch_percent,std::string * kernel_launch_classification,std::string * kernel_launch_statement)404 void KernelLaunchAnalysis(bool tfdata_used, double kernel_launch_percent,
405 std::string* kernel_launch_classification,
406 std::string* kernel_launch_statement) {
407 if (kernel_launch_percent >= kHighlyKernelLaunchBoundThresholdInPercent) {
408 *kernel_launch_classification = "high";
409 *kernel_launch_statement = absl::StrCat(
410 OneDigit(kernel_launch_percent),
411 " % of the total step time sampled is spent on 'Kernel Launch'.");
412 if (tfdata_used) {
413 absl::StrAppend(kernel_launch_statement, kKernelLaunchTfDataContention);
414 }
415 } else if (kernel_launch_percent >=
416 kModeratelyKernelLaunchBoundThresholdInPercent) {
417 *kernel_launch_classification = "moderate";
418 *kernel_launch_statement = absl::StrCat(
419 OneDigit(kernel_launch_percent),
420 " % of the total step time sampled is spent on 'Kernel Launch'.");
421 if (tfdata_used) {
422 absl::StrAppend(kernel_launch_statement, kKernelLaunchTfDataContention);
423 }
424 } else {
425 *kernel_launch_classification = "no";
426 *kernel_launch_statement = "";
427 }
428 }
429
AllOtherAnalysis(bool all_other_reported,double all_other_percent,std::string * all_other_classification,std::string * all_other_statement)430 void AllOtherAnalysis(bool all_other_reported, double all_other_percent,
431 std::string* all_other_classification,
432 std::string* all_other_statement) {
433 if (all_other_reported) {
434 *all_other_classification = "no";
435 *all_other_statement = "";
436 return;
437 }
438 if (all_other_percent >= kHighlyAllOtherBoundThresholdInPercent) {
439 *all_other_classification = "high";
440 *all_other_statement =
441 absl::StrCat(OneDigit(all_other_percent), kAllOthersPythonExplanation);
442 } else if (all_other_percent >= kModeratelyAllOtherBoundThresholdInPercent) {
443 *all_other_classification = "moderate";
444 *all_other_statement =
445 absl::StrCat(OneDigit(all_other_percent), kAllOthersPythonExplanation);
446 } else {
447 *all_other_classification = "no";
448 *all_other_statement = "";
449 }
450 }
451
452 // Tests if tf.data API is in use.
TfDataInUse(const InputTimeBreakdown & breakdown)453 bool TfDataInUse(const InputTimeBreakdown& breakdown) {
454 // Do not include enqueue_us because the "enqueue" Op that Xprof recognizes is
455 // not part of tf.data.
456 return breakdown.demanded_file_read_us() > 0 ||
457 breakdown.advanced_file_read_us() > 0 ||
458 breakdown.preprocessing_us() > 0;
459 }
460
461 // Returns a HTML link with the given text.
MakeDocLink(absl::string_view doc_link,absl::string_view text)462 std::string MakeDocLink(absl::string_view doc_link, absl::string_view text) {
463 return absl::StrCat("<a href=\"", doc_link, "\" target=\"_blank\">", text,
464 "</a>");
465 }
466
// Returns the URL of the introduction to the tf.data API.
std::string DatasetIntroDoc() {
  static constexpr char kDatasetIntroUrl[] =
      "https://www.tensorflow.org/guide/data";
  return kDatasetIntroUrl;
}
471
472 } // namespace
473
// Fills in `result`'s input_op_details and input_time_breakdown from the
// host-side TF op metrics. No-op if no input ops are found.
void GenerateHostResult(const OpMetricsDb& host_tf_metrics_db,
                        InputPipelineAnalysisResult* result) {
  InputOpMetrics input_op_metrics = SelectInputOpMetrics(host_tf_metrics_db);
  // Returns if the program is not using an input pipeline with
  // instrumentation and hence no input ops are found.
  if (input_op_metrics.input_op_metrics.empty()) return;

  // Self time of the input ops, aggregated per category, in microseconds.
  absl::flat_hash_map<InputOpCategory, double> aggregated_input_op_times_us;
  for (const OpMetrics* op_metrics : input_op_metrics.input_op_metrics) {
    InputOpCategory category =
        CategorizeInputOp(op_metrics->name(), op_metrics->category());
    *result->add_input_op_details() = ConvertOpMetricsToInputOpDetails(
        *op_metrics, input_op_metrics.input_op_time_ps, category);
    aggregated_input_op_times_us[category] +=
        PicoToMicro(op_metrics->self_time_ps());
  }

  double enqueue_time_us =
      aggregated_input_op_times_us[InputOpCategory::kEnqueue];
  // Total measured time of the non-enqueue input categories.
  double total_input_op_time_us =
      aggregated_input_op_times_us[InputOpCategory::kDemandedFileRead] +
      aggregated_input_op_times_us[InputOpCategory::kAdvancedFileRead] +
      aggregated_input_op_times_us[InputOpCategory::kPreprocessing];

  // Fraction of step time attributed to host-to-device transfer, capped at 1.
  double ratio = std::min(
      1.0, RatioOfHostToDeviceTimeToStepTime(host_tf_metrics_db, *result));
  DCHECK_GE(ratio, 0.0);
  // Estimates the non-enqueue input time from the enqueue time and the ratio
  // when the ratio is known; otherwise falls back to the measured sum.
  double non_enqueue_time_us = (ratio != 0.0)
                                   ? (enqueue_time_us * (1.0 - ratio) / ratio)
                                   : total_input_op_time_us;

  // Scales the various input-time components wrt to non_enqueue_time_us.
  double scaled_demanded_fileread_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kDemandedFileRead],
      total_input_op_time_us);
  double scaled_advanced_fileread_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kAdvancedFileRead],
      total_input_op_time_us);
  double scaled_preprocessing_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kPreprocessing],
      total_input_op_time_us);
  // Whatever estimated non-enqueue time remains unexplained by the scaled
  // components (clamped at zero).
  double unclassified_non_enqueue_time_us = std::max(
      0.0, non_enqueue_time_us - scaled_demanded_fileread_time_us -
               scaled_advanced_fileread_time_us - scaled_preprocessing_time_us);

  InputTimeBreakdown* input_time_breakdown =
      result->mutable_input_time_breakdown();
  input_time_breakdown->set_enqueue_us(enqueue_time_us);
  input_time_breakdown->set_demanded_file_read_us(
      scaled_demanded_fileread_time_us);
  input_time_breakdown->set_advanced_file_read_us(
      scaled_advanced_fileread_time_us);
  input_time_breakdown->set_preprocessing_us(scaled_preprocessing_time_us);
  input_time_breakdown->set_unclassified_non_enqueue_us(
      unclassified_non_enqueue_time_us);
}
533
// Builds the fixed list of input-pipeline tuning recommendations (as HTML
// snippets with documentation links) shown in the analysis.
InputPipelineAnalysisRecommendation GenerateRecommendation() {
  const absl::string_view kDatasetIntro =
      "https://www.tensorflow.org/programmers_guide/datasets";

  const absl::string_view kDatasetTopic =
      "https://www.tensorflow.org/api_docs/python/tf/data/Dataset#";

  const absl::string_view kTfRecordDataset =
      "https://www.tensorflow.org/api_docs/python/tf/data/"
      "TFRecordDataset#class_tfrecorddataset";

  InputPipelineAnalysisRecommendation recommendation;
  // One detail entry per input-time category (enqueue, preprocessing,
  // advanced file read, demanded file read, everything else).
  *recommendation.add_details() =
      "Enqueuing data: you may want to combine small input data chunks "
      "into fewer "
      "but larger chunks.";
  *recommendation.add_details() = absl::StrCat(
      "Data preprocessing: you may increase num_parallel_calls in ",
      AnchorElement(absl::StrCat(kDatasetTopic, "map"), "Dataset map()"),
      " or preprocess the data OFFLINE.");
  *recommendation.add_details() = absl::StrCat(
      "Reading data from files in advance: you may tune parameters in the "
      "following tf.data API (",
      AnchorElement(absl::StrCat(kDatasetTopic, "prefetch"), "prefetch size"),
      ", ",
      AnchorElement(absl::StrCat(kDatasetTopic, "interleave"),
                    "interleave cycle_length"),
      ", ", AnchorElement(kTfRecordDataset, "reader buffer_size"), ")");
  *recommendation.add_details() = absl::StrCat(
      "Reading data from files on demand: you should read data IN ADVANCE "
      "using the following tf.data API (",
      AnchorElement(absl::StrCat(kDatasetTopic, "prefetch"), "prefetch"), ", ",
      AnchorElement(absl::StrCat(kDatasetTopic, "interleave"), "interleave"),
      ", ", AnchorElement(kTfRecordDataset, "reader buffer"), ")");
  *recommendation.add_details() = absl::StrCat(
      "Other data reading or processing: you may consider using the ",
      AnchorElement(kDatasetIntro, "tf.data API"),
      " (if you are not using it now)");

  return recommendation;
}
575
ComputeStepTimeSummaryInMs(const protobuf::RepeatedPtrField<PerCoreStepInfo> & grouped_by_step)576 StepSummary ComputeStepTimeSummaryInMs(
577 const protobuf::RepeatedPtrField<PerCoreStepInfo>& grouped_by_step) {
578 Stat<double> total_step_stats_in_ms;
579 // iterates over each step.
580 for (const auto& coreid_stepinfo_map : grouped_by_step) {
581 double max_per_step_stats_in_ms = 0.0;
582 // iterates over each core.
583 for (const auto& coreid_and_stepinfo :
584 coreid_stepinfo_map.step_info_per_core()) {
585 const auto& step_info = coreid_and_stepinfo.second;
586 max_per_step_stats_in_ms = std::max(step_info.duration_ps() / kNumPsPerMs,
587 max_per_step_stats_in_ms);
588 }
589 // Step time of each step is determined by the slowest core.
590 total_step_stats_in_ms.UpdateStat(max_per_step_stats_in_ms);
591 }
592
593 return GetStepSummaryForSampleStats(total_step_stats_in_ms);
594 }
595
// Entry point: converts an OpStats proto into the full input-pipeline
// analysis — per-step details, diagnostics, host-side input-op breakdown,
// bottleneck analysis, and recommendations.
InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis(
    const OpStats& op_stats) {
  InputPipelineAnalysisResult result =
      ComputeGenericInputPipelineAnalysisResult(
          op_stats.step_db().step_sequence());
  PopulateStepDiagnostics(op_stats, result.mutable_diagnostics());
  result.set_hardware_type(HardwareType_Name(
      ParseHardwareType(op_stats.run_environment().device_type())));
  GenerateHostResult(op_stats.host_op_metrics_db(), &result);

  InputPipelineAnalysisRecommendation recommendation = GenerateRecommendation();
  BottleneckAnalysis bottleneck_analysis = ComputeBottleneckAnalysis(
      result.input_time_breakdown(), result.step_details());
  result.set_input_percent(bottleneck_analysis.input_percent());
  result.set_output_percent(bottleneck_analysis.output_percent());
  result.set_idle_percent(bottleneck_analysis.idle_percent());
  result.set_compute_percent(bottleneck_analysis.compute_percent());

  // The bottleneck analysis is packed into the recommendation so both travel
  // together in the result proto.
  recommendation.mutable_bottleneck_analysis()->PackFrom(bottleneck_analysis);
  *recommendation.mutable_summary_next_step() =
      GetSummaryNextStep(bottleneck_analysis.input_classification(),
                         result.input_time_breakdown());

  *result.mutable_recommendation() = recommendation;
  return result;
}
622
// Classifies how input-bound the program is ("host", "both", or "device")
// and writes a human-readable statement. Returns true iff the "All Others"
// time was used here to explain a potential input bound — presumably so the
// caller can pass it to AllOtherAnalysis as `all_other_reported` and avoid
// reporting it twice; verify against the caller.
bool InputAnalysis(double input_percent, double all_other_percent,
                   std::string* input_classification,
                   std::string* input_statement) {
  absl::string_view non_input_time = "other time";
  if (input_percent >= kHighlyInfeedBoundThresholdInPercent) {
    *input_classification = "host";
    *input_statement = absl::StrCat(
        "Your program is HIGHLY input-bound because ", OneDigit(input_percent),
        "% of the total step time sampled is waiting for input. Therefore, you "
        "should first focus on reducing the input time.");
    return false;
  } else if (input_percent >= kModeratelyInfeedBoundThresholdInPercent) {
    *input_classification = "both";
    *input_statement = absl::StrCat(
        "Your program is MODERATELY input-bound because ",
        OneDigit(input_percent),
        "% of the total step time sampled is waiting for input. Therefore, "
        "you would need to reduce both the input time and ",
        non_input_time, ".");
    return false;
  } else if (all_other_percent >= kModeratelyAllOtherBoundThresholdInPercent) {
    // Input analysis says it is not input-bound, but "All-Other" time
    // is significant. It could still be input-bound (or Python overhead).
    *input_classification = "both";
    *input_statement = absl::StrCat(
        "Your program is POTENTIALLY input-bound because ",
        OneDigit(all_other_percent),
        "% of the total step time sampled is spent on 'All Others' time (which "
        "could be due to I/O or Python execution or both).");
    return true;
  } else {
    // Definitely not input-bound.
    *input_classification = "device";
    *input_statement =
        absl::StrCat("Your program is NOT input-bound because only ",
                     OneDigit(input_percent),
                     "% of the total step time sampled is waiting for "
                     "input. Therefore, you should focus on "
                     "reducing ",
                     non_input_time, ".");
    return false;
  }
}
666
OutputAnalysis(double output_percent,std::string * output_classification,std::string * output_statement)667 void OutputAnalysis(double output_percent, std::string* output_classification,
668 std::string* output_statement) {
669 if (output_percent >= kHighlyOutfeedBoundThresholdInPercent) {
670 *output_classification = "host";
671 *output_statement = absl::StrCat(
672 "Your program is HIGHLY output-bound because ",
673 OneDigit(output_percent),
674 "% of the total step time sampled is spent on output. Therefore, you "
675 "should first focus on reducing the output time.");
676 } else if (output_percent >= kModeratelyOutfeedBoundThresholdInPercent) {
677 *output_classification = "both";
678 *output_statement = absl::StrCat(
679 "Your program is MODERATELY output-bound because ",
680 OneDigit(output_percent),
681 "% of the total step time sampled is spent on output. Therefore, "
682 "you would need to reduce both the output time and other time.");
683 } else {
684 *output_classification = "device";
685 *output_statement = "";
686 }
687 }
688
// Aggregates the per-step generic time breakdowns into an overall
// BottleneckAnalysis: the percentage of step time spent on input, output,
// device compute, device collectives, etc., plus classification strings and
// human-readable statements for each potential bottleneck category.
BottleneckAnalysis ComputeBottleneckAnalysis(
    const InputTimeBreakdown& input_time_breakdown,
    const ::tensorflow::protobuf::RepeatedPtrField<::google::protobuf::Any>&
        any_step_details) {
  // Totals (in ms) accumulated over all sampled steps.
  double total_step_time_ms = 0;
  double total_input_ms = 0;
  double total_output_ms = 0;
  double total_host_compute_ms = 0;
  double total_host_prepare_ms = 0;
  double total_host_compile_ms = 0;
  double total_device_compute_ms = 0;
  double total_device_to_device_ms = 0;
  double total_device_collectives_ms = 0;
  double total_unknown_ms = 0;

  for (const google::protobuf::Any& step_details : any_step_details) {
    PerGenericStepDetails details;
    bool success = step_details.UnpackTo(&details);
    // An empty type_url means an empty Any; only a non-empty payload of the
    // wrong type is an error. In that case return an empty analysis.
    if (!success && !step_details.type_url().empty()) {
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    total_step_time_ms += details.step_time_ms();
    // Input time includes both waiting for input and host-to-device transfer.
    total_input_ms +=
        details.host_wait_input_ms() + details.host_to_device_ms();
    total_output_ms += details.output_ms();
    total_host_prepare_ms += details.host_prepare_ms();
    total_device_compute_ms += details.device_compute_ms();
    total_device_to_device_ms += details.device_to_device_ms();
    total_device_collectives_ms += details.device_collectives_ms();
    total_host_compute_ms += details.host_compute_ms();
    total_host_compile_ms += details.host_compile_ms();
    total_unknown_ms += details.unknown_time_ms();
  }

  // Without any measured step time, no percentages can be computed; return an
  // analysis that says the bottleneck is unknown.
  if (total_step_time_ms == 0) {
    BottleneckAnalysis analysis;
    analysis.set_input_classification("unknown");
    analysis.set_input_statement(
        "No step time measured. Therefore we cannot tell where the "
        "performance bottleneck is.");
    analysis.set_kernel_launch_classification("no");
    analysis.set_kernel_launch_statement("");
    analysis.set_all_other_classification("no");
    analysis.set_all_other_statement("");
    analysis.set_device_collectives_classification("no");
    analysis.set_device_collectives_statement("");
    return analysis;
  }
  double input_percent = 100.0 * total_input_ms / total_step_time_ms;
  double output_percent = 100.0 * total_output_ms / total_step_time_ms;
  double compute_percent = 100.0 * total_device_compute_ms / total_step_time_ms;
  double device_collectives_percent =
      100.0 * total_device_collectives_ms / total_step_time_ms;

  // idle_percent is the remainder of the step time: it includes host_prepare
  // (i.e. kernel launch), device-to-device, host compute, host compile, and
  // unknown time. Clamped at 0 to guard against rounding pushing it negative.
  double idle_percent =
      std::max(0.0, 100.0 - input_percent - output_percent - compute_percent -
                        device_collectives_percent);
  double kernel_launch_percent =
      100.0 * total_host_prepare_ms / total_step_time_ms;
  double all_other_percent = 100.0 * total_unknown_ms / total_step_time_ms;

  std::string input_classification;
  std::string input_statement;
  // InputAnalysis reports whether the "all others" time was already covered
  // by the input statement (so AllOtherAnalysis can avoid repeating it).
  bool all_other_reported =
      InputAnalysis(input_percent, all_other_percent, &input_classification,
                    &input_statement);

  std::string device_collectives_classification;
  std::string device_collectives_statement;
  DeviceCollectivesAnalysis(device_collectives_percent,
                            &device_collectives_classification,
                            &device_collectives_statement);

  std::string kernel_launch_classification;
  std::string kernel_launch_statement;
  // Kernel-launch advice differs depending on whether tf.data is in use.
  KernelLaunchAnalysis(TfDataInUse(input_time_breakdown), kernel_launch_percent,
                       &kernel_launch_classification, &kernel_launch_statement);

  std::string all_other_classification;
  std::string all_other_statement;
  AllOtherAnalysis(all_other_reported, all_other_percent,
                   &all_other_classification, &all_other_statement);

  BottleneckAnalysis analysis;
  analysis.set_input_percent(input_percent);
  analysis.set_output_percent(output_percent);
  analysis.set_idle_percent(idle_percent);
  analysis.set_compute_percent(compute_percent);

  analysis.set_input_classification(input_classification);
  analysis.set_input_statement(input_statement);
  analysis.set_kernel_launch_classification(kernel_launch_classification);
  analysis.set_kernel_launch_statement(kernel_launch_statement);
  analysis.set_all_other_classification(all_other_classification);
  analysis.set_all_other_statement(all_other_statement);
  analysis.set_device_collectives_classification(
      device_collectives_classification);
  analysis.set_device_collectives_statement(device_collectives_statement);

  return analysis;
}
794
GetSummaryNextStep(absl::string_view input_classification,const InputTimeBreakdown & breakdown)795 std::string GetSummaryNextStep(absl::string_view input_classification,
796 const InputTimeBreakdown& breakdown) {
797 std::string summary_next_step;
798 if (input_classification == "host" || input_classification == "both") {
799 if (!TfDataInUse(breakdown)) {
800 summary_next_step = absl::StrCat(
801 "Consider using ", MakeDocLink(DatasetIntroDoc(), "the tf.data API"),
802 " to enable profiler's host-side analysis for input pipeline. "
803 "Profiler currently does not support custom input pipeline (please "
804 "ignore "
805 "Section ",
806 kHostAnalysisSectionNumber, " below).");
807 } else {
808 summary_next_step =
809 absl::StrCat("Look at Section ", kHostAnalysisSectionNumber,
810 " for the breakdown of input time on the host.");
811 }
812 } else {
813 summary_next_step = "You may skip the rest of this page.";
814 }
815
816 return summary_next_step;
817 }
818
HostToDeviceTransferAsPercentOfInputTime(const InputTimeBreakdown & breakdown)819 double HostToDeviceTransferAsPercentOfInputTime(
820 const InputTimeBreakdown& breakdown) {
821 // Thanks to the scaling trick we did in GenerateHostResult(), we can
822 // estimate the percentage of input-time spent on host-to-device transfer in
823 // the following way.
824 double total_input_time_us =
825 breakdown.demanded_file_read_us() + breakdown.advanced_file_read_us() +
826 breakdown.preprocessing_us() + breakdown.enqueue_us() +
827 breakdown.unclassified_non_enqueue_us();
828 return 100.0 * SafeDivide(breakdown.enqueue_us(), total_input_time_us);
829 }
830
831 } // namespace profiler
832 } // namespace tensorflow
833