syntax = "proto3";

package tensorflow.tpu;

import "tensorflow/core/protobuf/tpu/optimization_parameters.proto";

// Configuration for the TPU embedding engine: table layouts, execution mode,
// sharding, and input-feature descriptions.
message TPUEmbeddingConfiguration {
  // Description of the various embedding tables.
  message TableDescriptor {
    // Name of the table.
    string name = 1;

    // Size of the vocabulary (i.e., number of rows) in the table.
    int64 vocabulary_size = 2;

    // The embedding dimension (i.e., the width of the embedding table).
    int32 dimension = 3;

    // Number of features mapped to this table.
    int32 num_features = 4;

    // Details of the learning algorithm used to update the embedding
    // parameters.
    OptimizationParameters optimization_parameters = 5;
  }
  repeated TableDescriptor table_descriptor = 1;

  // Mode. Should the embedding layer program be run for inference (just
  // forward pass), training (both forward and backward pass) or just the
  // backward_pass.
  enum Mode {
    UNSPECIFIED = 0;
    INFERENCE = 1;
    TRAINING = 2;
    BACKWARD_PASS_ONLY = 3;
  }
  Mode mode = 2;

  // Number of samples in each batch of embedding layer activations sent to
  // the TensorCore.
  int32 batch_size_per_tensor_core = 3;

  // Number of TPU hosts used for inference/training.
  int32 num_hosts = 4;

  // Number of TensorCore used for inference/training.
  int32 num_tensor_cores = 5;

  // Sharding strategy of the embedding tables among the hosts.
  // If the sharding_strategy is "mod", each id is assigned to host
  // "id % num_hosts". For instance, 13 ids are split across 5 hosts as:
  // [[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]].
  // If the sharding_strategy is "div", ids are assigned to hosts in a
  // contiguous manner. In this case, 13 ids are split across 5 hosts as:
  // [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]].
  // In both the strategies, if the id space does not evenly divide the number
  // of hosts, each of the first "table_descriptor.vocabulary_size % num_hosts"
  // hosts will be assigned one more id.
  // This partitioning strategy exactly follows that in the embedding_lookup
  // TensorFlow function at tensorflow/python/ops/embedding_ops.py.
  enum ShardingStrategy {
    DIV_DEFAULT = 0;
    MOD = 1;
  }
  ShardingStrategy sharding_strategy = 6;

  // This parameter determines if the execution of the sparse core will be
  // pipelined with that of the TensorCore. This parameter only affects results
  // when mode=TRAINING. If mode=INFERENCE or BACKWARD_PASS_ONLY, this
  // parameter does not affect execution and hence, is a don't care value.
  //
  // false: The execution of the sparse core is not pipelined with that of the
  // TensorCore. The forward pass of every step on the sparse core is executed
  // only after the backward pass of the previous step is complete. And the
  // backward pass on the sparse core is executed only after the embedding
  // gradients have been computed on the TensorCore on every step. This ensures
  // that the activations on every step observe the gradient updates from the
  // previous step on both the sparse core and the TensorCore.
  //
  // true: The execution of the sparse core is pipelined with that of the
  // TensorCore. The forward pass of every step on the sparse core can be
  // executed after the forward pass of the previous step is complete without
  // waiting for the backward pass. This improves the utilization of the
  // sparse core allowing it to process step N+1 while the embedding gradients
  // for step N are computed on the TensorCore. The backward pass of every
  // step on the sparse core is executed directly after the forward pass for
  // the next step is complete. The drawback is that embedding activations for
  // step N+1 do not observe the embedding gradient updates from step N. This
  // could affect model quality if step N and N+1 involve the same set of
  // embedding IDs. However, since the embedding updates are sparse, this is
  // generally not considered a problem.
  bool pipeline_execution_with_tensor_core = 7;

  // Directory where embedding lookup statistics are stored. These statistics
  // summarize information about the inputs to the embedding lookup
  // operation, in particular, the average number of embedding IDs per example
  // and how well the embedding IDs are load balanced across the system. The
  // lookup statistics are used during TPU initialization for embedding table
  // partitioning. Collection of lookup statistics is done at runtime by
  // profiling the embedding inputs: only 3% of input samples are profiled to
  // minimize host CPU overhead. Once a suitable number of samples are
  // profiled, the lookup statistics are saved to table-specific files in the
  // profile data directory generally at the end of a TPU training loop. The
  // filename corresponding to each table is obtained by hashing table
  // specific parameters (e.g., table name and number of features) and global
  // configuration parameters (e.g., sharding strategy and TPU worker task
  // count). The same profile data directory can be shared amongst several
  // models to reuse embedding lookup statistics.
  string profile_data_directory = 9;

  // Description of different input features.
  message FeatureDescriptor {
    // Name of the input feature.
    string name = 1;

    // Index of the corresponding table in the TableDescriptor list.
    int32 table_id = 2;

    // Static shape of the inputs (excluding the reduction axis). Note that
    // the shape of the actual inputs provided using the infeed op must be
    // strictly smaller than input_shape. The outputs received at the
    // TensorCore will have rank = input_shape.size() + 1. The innermost axis
    // corresponds to the embedding dimension. If the input has shape
    // [m, n, k] (excluding the reduction axis) and the embedding dimension is
    // d, the output received at the TensorCore will have shape [m, n, k, d].
    repeated int32 input_shape = 3;
  }
  repeated FeatureDescriptor feature_descriptor = 10;

  // SPMD (Single Program Multiple Data) sharding configuration for
  // TPUEmbedding. When model parallelism is used on the TensorCore, the
  // number of cores per replica must be passed to TPUEmbedding so that the
  // right shapes can be computed in the TF/XLA bridge.
  message SpmdSharding {
    // Whether SPMD sharding is enabled.
    bool enabled = 1;

    // Number of cores per replica.
    int32 num_cores_per_replica = 2;
  }
  SpmdSharding spmd_sharding = 11;

  // Old TPU embedding output layout; field 8 and its name stay reserved so
  // the deleted field can never be reused on the wire or in JSON/codegen.
  reserved "output_layout";
  reserved 8;
}

// A placeholder message that is used to define a unique Status payload
// URL for TPU embedding errors.
message TPUEmbeddingError {}