// xref: /aosp_15_r20/external/federated-compute/fcp/client/histogram_counters.proto (revision 14675a029014e728ec732f129a32e299b2da0601)
/*
 * Copyright 2020 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
syntax = "proto3";

package fcp.client;

option java_package = "com.google.intelligence.fcp.client";
option java_multiple_files = true;

/**
 * Enumerations of timer and counter identifiers.
 *
 * For monitoring, certain timers and counters are logged as integer histograms.
 * This allows for computing aggregate histograms on the cloud and determining
 * distributions of latencies for blocks of code, resource usage etc.
 *
 * Value names and numbers are a published contract (logged and aggregated
 * server-side); never rename or renumber existing entries.
 */
enum HistogramCounters {
  HISTOGRAM_COUNTER_UNDEFINED = 0;

  /**
   * How long it takes to run a plan on device, excluding downloading the plan
   * and reporting results.
   */
  TRAINING_RUN_PHASE_LATENCY = 1;

  /**
   * The end time of running training for a whole plan, excluding downloading
   * the plan and reporting results, relative to the start of the training
   * session.
   */
  TRAINING_RUN_PHASE_END_TIME = 2;

  /** How long running a "restore state op" takes. */
  TRAINING_RESTORE_STATE_LATENCY = 3;

  /**
   * How long it takes to run training for a whole client execution (which may
   * involve running multiple epochs). This includes connecting and fetching
   * examples from the example store, as well as training over them.
   */
  TRAINING_RUN_CLIENT_EXECUTION_LATENCY = 4;

  /** How long running an "init op" takes. */
  TRAINING_INIT_OP_LATENCY = 5;

  /** How long running a "before op" takes. */
  TRAINING_BEFORE_OP_LATENCY = 6;

  /** How long running an "after op" takes. */
  TRAINING_AFTER_OP_LATENCY = 7;

  /**
   * How long it takes to run training for a whole epoch. This includes
   * connecting and fetching examples from the example store, as well as
   * training over them.
   */
  TRAINING_RUN_EPOCH_LATENCY = 8;

  /**
   * How long it takes to gather enough examples for a mini batch.
   * This counter may be an average across minibatches and epochs.
   */
  TRAINING_GATHER_MINI_BATCH_LATENCY = 9;

  /**
   * How long it takes to run training on a mini batch.
   * This counter may be an average across minibatches and epochs.
   */
  TRAINING_RUN_MINI_BATCH_LATENCY = 10;

  /**
   * How long it takes the TensorFlow session to terminate after it's been
   * interrupted.
   */
  TRAINING_INTERRUPT_TERMINATION_LATENCY = 11;

  /** How long it takes to commit the opstats message to the database. */
  TRAINING_OPSTATS_COMMIT_LATENCY = 12;

  /**
   * The number of examples encountered during overall training, across all
   * client executions.
   */
  TRAINING_OVERALL_EXAMPLE_COUNT = 100001;

  /**
   * The sum of the size (in bytes) of all the examples encountered during
   * overall training, across all client executions.
   */
  TRAINING_OVERALL_EXAMPLE_SIZE = 100002;

  /**
   * The number of examples encountered in a client execution, across all
   * epochs.
   */
  TRAINING_CLIENT_EXECUTION_EXAMPLE_COUNT = 100003;

  /**
   * The sum of the size (in bytes) of all the examples encountered in a client
   * execution, across all epochs.
   */
  TRAINING_CLIENT_EXECUTION_EXAMPLE_SIZE = 100004;

  /**
   * The number of examples encountered in an epoch.
   * This counter may be an average from multiple epochs.
   */
  TRAINING_EPOCH_EXAMPLE_COUNT = 100005;

  /**
   * The sum of the size (in bytes) of all the examples encountered in an
   * epoch. This counter may be an average from multiple epochs.
   */
  TRAINING_EPOCH_EXAMPLE_SIZE = 100006;

  /**
   * The number of examples in a mini batch.
   * This counter may be an average from multiple minibatches.
   */
  TRAINING_MINI_BATCH_EXAMPLE_COUNT = 100007;

  /**
   * The sum of the size (in bytes) of all the examples in a mini batch.
   * This counter may be an average from multiple minibatches.
   */
  TRAINING_MINI_BATCH_EXAMPLE_SIZE = 100008;

  /**
   * The size (in bytes) of the OpStatsDb file.
   */
  OPSTATS_DB_SIZE_BYTES = 100009;

  /**
   * The number of entries in OpStatsDb.
   */
  OPSTATS_DB_NUM_ENTRIES = 100010;

  /**
   * The number of entries pruned from OpStatsDb due to exceeding max size.
   */
  OPSTATS_NUM_PRUNED_ENTRIES = 100011;

  /**
   * The tenure (in hours) of the oldest entry which has been pruned from the
   * OpStatsDb due to exceeding max size.
   */
  OPSTATS_OLDEST_PRUNED_ENTRY_TENURE_HOURS = 100012;

  /** How long checking in/downloading a plan takes (for FL plans only). */
  TRAINING_FL_CHECKIN_LATENCY = 200001;

  /**
   * The end time of reporting results to the server, relative to the start
   * of the training session.
   */
  TRAINING_FL_REPORT_RESULTS_END_TIME = 200002;

  /** How long reporting results to the server takes. */
  TRAINING_FL_REPORT_RESULTS_LATENCY = 200003;

  /**
   * The end time of checking in/downloading a plan from the server, relative
   * to the start of the training session.
   */
  TRAINING_FL_CHECKIN_END_TIME = 200004;

  /**
   * How long the eligibility eval checkin with the server takes (for FL
   * only). NOTE(review): original comment was a copy-paste of 200003
   * ("reporting results"); description inferred from the value name —
   * confirm against the client implementation.
   */
  TRAINING_FL_ELIGIBILITY_EVAL_CHECKIN_LATENCY = 200005;
}