// xref: /aosp_15_r20/external/tensorflow/tensorflow/core/protobuf/tpu/tpu_embedding_configuration.proto (revision b6fb3261f9314811a0f4371741dbb8839866f948)
syntax = "proto3";

package tensorflow.tpu;

import "tensorflow/core/protobuf/tpu/optimization_parameters.proto";

// Top-level configuration for the TPU embedding engine: describes the
// embedding tables, how features map onto them, how tables are sharded
// across hosts, and how the sparse core interacts with the TensorCore.
message TPUEmbeddingConfiguration {
  // Description of the various embedding tables.
  message TableDescriptor {
    // Name of the table.
    string name = 1;

    // Size of the vocabulary (i.e., number of rows) in the table.
    int64 vocabulary_size = 2;

    // The embedding dimension (i.e., the width of the embedding table).
    int32 dimension = 3;

    // Number of features mapped to this table.
    int32 num_features = 4;

    // Details of the learning algorithm used to update the embedding
    // parameters.
    OptimizationParameters optimization_parameters = 5;
  }
  repeated TableDescriptor table_descriptor = 1;

  // Mode. Should the embedding layer program be run for inference (just forward
  // pass), training (both forward and backward pass) or just the backward_pass.
  enum Mode {
    UNSPECIFIED = 0;
    INFERENCE = 1;
    TRAINING = 2;
    BACKWARD_PASS_ONLY = 3;
  }
  Mode mode = 2;

  // Number of samples in each batch of embedding layer activations sent to
  // the TensorCore.
  int32 batch_size_per_tensor_core = 3;

  // Number of TPU hosts used for inference/training.
  int32 num_hosts = 4;

  // Number of TensorCores used for inference/training.
  int32 num_tensor_cores = 5;

  // Sharding strategy of the embedding tables among the hosts.
  // If the sharding_strategy is "mod", each id is assigned to host
  // "id % num_hosts". For instance, 13 ids are split across 5 hosts as:
  // [[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]].
  // If the sharding_strategy is "div", ids are assigned to hosts in a
  // contiguous manner. In this case, 13 ids are split across 5 hosts as:
  // [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]].
  // In both the strategies, if the id space does not evenly divide the number
  // of hosts, each of the first "table_descriptor.vocabulary_size % num_hosts"
  // hosts will be assigned one more id.
  // This partitioning strategy exactly follows that in the embedding_lookup
  // TensorFlow function at tensorflow/python/ops/embedding_ops.py.
  enum ShardingStrategy {
    DIV_DEFAULT = 0;
    MOD = 1;
  }
  ShardingStrategy sharding_strategy = 6;

  // This parameter determines if the execution of the sparse core will be
  // pipelined with that of the TensorCore. This parameter only affects results
  // when mode=TRAINING. If mode=INFERENCE or BACKWARD_PASS_ONLY, this parameter
  // does not affect execution and hence, is a don't care value.
  //
  // false: The execution of the sparse core is not pipelined with that of the
  // TensorCore. The forward pass of every step on the sparse core is executed
  // only after the backward pass of the previous step is complete. And the
  // backward pass on the sparse core is executed only after the embedding
  // gradients have been computed on the TensorCore on every step. This ensures
  // that the activations on every step observe the gradient updates from the
  // previous step on both the sparse core and the TensorCore.
  //
  // true: The execution of the sparse core is pipelined with that of the
  // TensorCore. The forward pass of every step on the sparse core can be
  // executed after the forward pass of the previous step is complete without
  // waiting for the backward pass. This improves the utilization of the sparse
  // core allowing it to process step N+1 while the embedding gradients for step
  // N are computed on the TensorCore. The backward pass of every step on the
  // sparse core is executed directly after the forward pass for the next step
  // is complete. The drawback is that embedding activations for step N+1 do not
  // observe the embedding gradient updates from step N. This could affect model
  // quality if step N and N+1 involve the same set of embedding IDs. However,
  // since the embedding updates are sparse, this is generally not considered a
  // problem.
  bool pipeline_execution_with_tensor_core = 7;

  // Directory where embedding lookup statistics are stored. These statistics
  // summarize information about the inputs to the embedding lookup
  // operation, in particular, the average number of embedding IDs per example
  // and how well the embedding IDs are load balanced across the system. The
  // lookup statistics are used during TPU initialization for embedding table
  // partitioning. Collection of lookup statistics is done at runtime by
  // profiling the embedding inputs: only 3% of input samples are profiled to
  // minimize host CPU overhead. Once a suitable number of samples are
  // profiled, the lookup statistics are saved to table-specific files in the
  // profile data directory generally at the end of a TPU training loop. The
  // filename corresponding to each table is obtained by hashing table specific
  // parameters (e.g., table name and number of features) and global
  // configuration parameters (e.g., sharding strategy and TPU worker task
  // count). The same profile data directory can be shared amongst several
  // models to reuse embedding lookup statistics.
  string profile_data_directory = 9;

  // Description of different input features.
  message FeatureDescriptor {
    // Name of the input feature.
    string name = 1;

    // Index of the corresponding table in the TableDescriptor list.
    int32 table_id = 2;

    // Static shape of the inputs (excluding the reduction axis). Note that
    // the shape of the actual inputs provided using the infeed op must be
    // strictly smaller than input_shape. The outputs received at the TensorCore
    // will have rank = input_shape.size() + 1. The innermost axis corresponds
    // to the embedding dimension. If the input has shape [m, n, k] (excluding
    // the reduction axis) and the embedding dimension is d, the output received
    // at the TensorCore will have shape [m, n, k, d].
    repeated int32 input_shape = 3;
  }
  repeated FeatureDescriptor feature_descriptor = 10;

  // SPMD (Single Program Multiple Data) sharding configuration for
  // TPUEmbedding. When model parallelism is used on the TensorCore, the number
  // of cores per replica must be passed to TPUEmbedding so that the right
  // shapes can be computed in the TF/XLA bridge.
  message SpmdSharding {
    // Whether SPMD sharding is enabled.
    bool enabled = 1;

    // Number of cores per replica.
    int32 num_cores_per_replica = 2;
  }
  SpmdSharding spmd_sharding = 11;

  // Old TPU embedding output layout. Field number 8 and the name
  // "output_layout" are reserved so they can never be reused with a
  // different meaning on the wire or in JSON/codegen.
  reserved "output_layout";
  reserved 8;
}
// A placeholder message that is used to define a unique Status payload
// URL for TPU embedding errors. Intentionally empty: only the message's
// fully-qualified type name is used as the payload type URL.
message TPUEmbeddingError {}