/*
 * Copyright 2020 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
syntax = "proto3";

package fcp.client;

option java_package = "com.google.intelligence.fcp.client";
option java_multiple_files = true;

/**
 * Enumerations of timer and counter identifiers.
 *
 * For monitoring, certain timers and counters are logged as integer
 * histograms. This allows for computing aggregate histograms on the cloud and
 * determining distributions of latencies for blocks of code, resource usage
 * etc.
 */
enum HistogramCounters {
  HISTOGRAM_COUNTER_UNDEFINED = 0;

  /**
   * How long it takes to run a plan on device, excluding downloading the plan
   * and reporting results.
   */
  TRAINING_RUN_PHASE_LATENCY = 1;

  /**
   * The end time of running training for a whole plan, excluding downloading
   * the plan and reporting results, relative to the start of the training
   * session.
   */
  TRAINING_RUN_PHASE_END_TIME = 2;

  /** How long running a "restore state op" takes. */
  TRAINING_RESTORE_STATE_LATENCY = 3;

  /**
   * How long it takes to run training for a whole client execution (which may
   * involve running multiple epochs). This includes connecting and fetching
   * examples from the example store, as well as training over them.
   */
  TRAINING_RUN_CLIENT_EXECUTION_LATENCY = 4;

  /** How long running an "init op" takes. */
  TRAINING_INIT_OP_LATENCY = 5;

  /** How long running a "before op" takes. */
  TRAINING_BEFORE_OP_LATENCY = 6;

  /** How long running an "after op" takes. */
  TRAINING_AFTER_OP_LATENCY = 7;

  /**
   * How long it takes to run training for a whole epoch. This includes
   * connecting and fetching examples from the example store, as well as
   * training over them.
   */
  TRAINING_RUN_EPOCH_LATENCY = 8;

  /**
   * How long it takes to gather enough examples for a mini batch.
   * This counter may be an average across minibatches and epochs.
   */
  TRAINING_GATHER_MINI_BATCH_LATENCY = 9;

  /**
   * How long it takes to run training on a mini batch.
   * This counter may be an average across minibatches and epochs.
   */
  TRAINING_RUN_MINI_BATCH_LATENCY = 10;

  /**
   * How long it takes the TensorFlow session to terminate after it's been
   * interrupted.
   */
  TRAINING_INTERRUPT_TERMINATION_LATENCY = 11;

  /** How long it takes to commit the opstats message to the database. */
  TRAINING_OPSTATS_COMMIT_LATENCY = 12;

  /**
   * The number of examples encountered during overall training, across all
   * client executions.
   */
  TRAINING_OVERALL_EXAMPLE_COUNT = 100001;

  /**
   * The sum of the size (in bytes) of all the examples encountered during
   * overall training, across all client executions.
   */
  TRAINING_OVERALL_EXAMPLE_SIZE = 100002;

  /**
   * The number of examples encountered in a client execution, across all
   * epochs.
   */
  TRAINING_CLIENT_EXECUTION_EXAMPLE_COUNT = 100003;

  /**
   * The sum of the size (in bytes) of all the examples encountered in a
   * client execution, across all epochs.
   */
  TRAINING_CLIENT_EXECUTION_EXAMPLE_SIZE = 100004;

  /**
   * The number of examples encountered in an epoch.
   * This counter may be an average from multiple epochs.
   */
  TRAINING_EPOCH_EXAMPLE_COUNT = 100005;

  /**
   * The sum of the size (in bytes) of all the examples encountered in an
   * epoch. This counter may be an average from multiple epochs.
   */
  TRAINING_EPOCH_EXAMPLE_SIZE = 100006;

  /**
   * The number of examples in a mini batch.
   * This counter may be an average from multiple minibatches.
   */
  TRAINING_MINI_BATCH_EXAMPLE_COUNT = 100007;

  /**
   * The sum of the size (in bytes) of all the examples in a mini batch.
   * This counter may be an average from multiple minibatches.
   */
  TRAINING_MINI_BATCH_EXAMPLE_SIZE = 100008;

  /**
   * The size (in bytes) of the OpStatsDb file.
   */
  OPSTATS_DB_SIZE_BYTES = 100009;

  /**
   * The number of entries in OpStatsDb.
   */
  OPSTATS_DB_NUM_ENTRIES = 100010;

  /**
   * The number of entries pruned from OpStatsDb due to exceeding max size.
   */
  OPSTATS_NUM_PRUNED_ENTRIES = 100011;

  /**
   * The tenure (in hours) of the oldest entry which has been pruned from the
   * OpStatsDb due to exceeding max size.
   */
  OPSTATS_OLDEST_PRUNED_ENTRY_TENURE_HOURS = 100012;

  /** How long checking in/downloading a plan takes (for FL plans only). */
  TRAINING_FL_CHECKIN_LATENCY = 200001;

  /**
   * The end time of reporting results to the server, relative to the start
   * of the training session.
   */
  TRAINING_FL_REPORT_RESULTS_END_TIME = 200002;

  /** How long reporting results to the server takes. */
  TRAINING_FL_REPORT_RESULTS_LATENCY = 200003;

  /**
   * The end time of checking in/downloading a plan from the server, relative
   * to the start of the training session.
   */
  TRAINING_FL_CHECKIN_END_TIME = 200004;

  /**
   * How long checking in/downloading an eligibility eval plan takes (for FL
   * plans only).
   */
  TRAINING_FL_ELIGIBILITY_EVAL_CHECKIN_LATENCY = 200005;
}