// xref: /aosp_15_r20/external/tensorflow/tensorflow/core/data/service/
// dispatcher.proto (revision b6fb3261f9314811a0f4371741dbb8839866f948)
syntax = "proto3";

package tensorflow.data;

import "tensorflow/core/data/service/common.proto";
import "tensorflow/core/framework/tensor.proto";
import "tensorflow/core/protobuf/data_service.proto";

// Progress report for a single task; carried in
// `WorkerUpdateRequest.updates`.
// Next tag: 3
message TaskProgress {
  // The task that this message is about.
  int64 task_id = 1;
  // Whether the task has completed.
  bool completed = 2;
}

// Request message for the periodic `DispatcherService.WorkerHeartbeat` RPC.
// Fields are declared out of tag order; tags 1 through 5 are all in use.
// Next tag: 6
message WorkerHeartbeatRequest {
  // Address of the worker sending the heartbeat.
  string worker_address = 1;
  // NOTE(review): presumably the address used to transfer data from this
  // worker -- confirm against the worker implementation.
  string transfer_address = 3;
  // Tags attached to the worker.
  // NOTE(review): their meaning is not defined in this file -- confirm.
  repeated string worker_tags = 4;
  // The UID of the worker Borg job, used for telemetry.
  int64 worker_uid = 5;
  // NOTE(review): presumably the ids of the tasks the worker currently
  // holds -- confirm.
  repeated int64 current_tasks = 2;
}

// Response message for `DispatcherService.WorkerHeartbeat`.
// Next tag: 3
message WorkerHeartbeatResponse {
  // Definitions of new tasks (`TaskDef` is declared in an imported file).
  // NOTE(review): presumably tasks the worker should start -- confirm.
  repeated TaskDef new_tasks = 1;
  // NOTE(review): presumably ids of tasks the worker should drop -- confirm.
  repeated int64 tasks_to_delete = 2;
}

// Request message for `DispatcherService.WorkerUpdate`, which updates the
// dispatcher with information about the worker's state.
// Next tag: 3
message WorkerUpdateRequest {
  // Address of the worker sending the update.
  string worker_address = 1;
  // Per-task progress reports.
  repeated TaskProgress updates = 2;
}

// Response message for `DispatcherService.WorkerUpdate`. Currently carries
// no fields.
// Next tag: 1
message WorkerUpdateResponse {}

// Request message for `DispatcherService.GetDatasetDef`.
// Next tag: 2
message GetDatasetDefRequest {
  // Id of the dataset whose definition is requested.
  string dataset_id = 1;
}

// Response message for `DispatcherService.GetDatasetDef`.
// Next tag: 2
message GetDatasetDefResponse {
  // The dataset definition (`DatasetDef` is declared in an imported file).
  DatasetDef dataset_def = 1;
}

// Request message for `DispatcherService.GetSplit`, which gets the next
// split for a given iteration.
// Next tag: 4
message GetSplitRequest {
  // The iteration to get a split for.
  int64 iteration_id = 1;
  // NOTE(review): presumably the repetition of the job being read --
  // confirm.
  int64 repetition = 2;
  // NOTE(review): presumably selects one of several split providers of the
  // iteration -- confirm.
  int64 split_provider_index = 3;
}

// Response message for `DispatcherService.GetSplit`.
// Next tag: 3
message GetSplitResponse {
  // The split, carried as a `TensorProto`.
  TensorProto split = 1;
  // NOTE(review): presumably true once no further splits remain -- confirm
  // whether `split` is populated in that case.
  bool end_of_splits = 2;
}

// Request message for `DispatcherService.GetVersion`. Currently carries no
// fields.
// Next tag: 1
message GetVersionRequest {}

// Response message for `DispatcherService.GetVersion`.
// Next tag: 2
message GetVersionResponse {
  // The API version of the server.
  int64 version = 1;
}

// Request message for `DispatcherService.GetOrRegisterDataset`.
// Next tag: 5
message GetOrRegisterDatasetRequest {
  // The dataset to register.
  DatasetDef dataset = 1;
  // Metadata related to tf.data service.
  DataServiceMetadata metadata = 3;
  // Single-member oneof, so that an unset id can be told apart from an
  // empty string.
  oneof optional_dataset_id {
    // If provided, tf.data service will register the dataset with the specified
    // ID. Otherwise, it will generate a unique dataset ID.
    string dataset_id = 4;
  }
  // Tag 2 is reserved and must not be reused.
  reserved 2;
}

// Response message for `DispatcherService.GetOrRegisterDataset`.
// Next tag: 2
message GetOrRegisterDatasetResponse {
  // The id for the registered dataset.
  string dataset_id = 1;
}

// Request message for `DispatcherService.GetDataServiceMetadata`.
// Next tag: 2
message GetDataServiceMetadataRequest {
  // The dataset id to get the data service dataset metadata.
  string dataset_id = 1;
}

// Response message for `DispatcherService.GetDataServiceMetadata`.
// Next tag: 2
message GetDataServiceMetadataResponse {
  // The retrieved data service dataset metadata.
  DataServiceMetadata metadata = 1;
}

// Request message for `DispatcherService.GetDataServiceConfig`. Currently
// carries no fields.
// Next tag: 1
message GetDataServiceConfigRequest {}

// Response message for `DispatcherService.GetDataServiceConfig`.
// Next tag: 2
message GetDataServiceConfigResponse {
  // The config of the data service cluster.
  DataServiceConfig config = 1;
}

// Request message for `DispatcherService.GetOrCreateJob`, which gets a job
// if it already exists and otherwise creates it.
// Next tag: 7
message GetOrCreateJobRequest {
  // The id of the dataset to create a job for.
  string dataset_id = 1;
  // A mode controlling how the tf.data service produces data for the job.
  ProcessingModeDef processing_mode_def = 2;
  // Optional job name identifying a shared job. If not set, the RPC will always
  // create a new job.
  oneof optional_job_name {
    string job_name = 3;
  }
  // Optional number of consumers. If set, the job's tasks will provide
  // their elements to consumers round-robin.
  oneof optional_num_consumers {
    int64 num_consumers = 4;
  }
  // True if cross-trainer cache is enabled.
  bool use_cross_trainer_cache = 5;
  // Specifies which workers the client of this job reads from.
  TargetWorkers target_workers = 6;
}

// Response message for `DispatcherService.GetOrCreateJob`.
// Next tag: 2
message GetOrCreateJobResponse {
  // Id of the job that was found or created.
  int64 job_id = 1;
}

// Request message for `DispatcherService.GetOrCreateIteration`, which gets
// an iteration if it already exists and otherwise creates it.
// Next tag: 3
message GetOrCreateIterationRequest {
  // The job to create an iteration for.
  int64 job_id = 1;
  // Which repetition of the job to read from.
  int64 repetition = 2;
}

// Response message for `DispatcherService.GetOrCreateIteration`.
// Next tag: 2
message GetOrCreateIterationResponse {
  // An id for the client that will read from the iteration. When the client is
  // done with the iteration, they should call ReleaseIterationClient with this
  // id.
  int64 iteration_client_id = 1;
}

// Request message for `DispatcherService.MaybeRemoveTask`, which attempts to
// remove a task from a round-robin read iteration.
// Next tag: 4
message MaybeRemoveTaskRequest {
  // The task to attempt to remove.
  int64 task_id = 1;
  // NOTE(review): presumably the index of the consumer making the request
  // -- confirm.
  int64 consumer_index = 2;
  // NOTE(review): presumably the round-robin round the request applies to
  // -- confirm.
  int64 round = 3;
}

// Response message for `DispatcherService.MaybeRemoveTask`.
// Next tag: 2
message MaybeRemoveTaskResponse {
  // Whether the task was removed.
  bool removed = 1;
}

// Request message for `DispatcherService.ReleaseIterationClient`.
// Next tag: 2
message ReleaseIterationClientRequest {
  // The iteration client id to release, as handed out in
  // `GetOrCreateIterationResponse.iteration_client_id`.
  int64 iteration_client_id = 1;
}

// Response message for `DispatcherService.ReleaseIterationClient`. Currently
// carries no fields.
// Next tag: 1
message ReleaseIterationClientResponse {}

// Request message for `DispatcherService.ClientHeartbeat`.
// Next tag: 5
message ClientHeartbeatRequest {
  // Tag 3 is reserved and must not be reused.
  reserved 3;
  // The iteration client id to heartbeat for.
  int64 iteration_client_id = 1;
  // Reports which round the client is currently reading from when doing
  // round-robin reads.
  oneof optional_current_round {
    int64 current_round = 2;
  }
  // Reports whether the client has successfully blocked the indicated round
  // from starting. This enables the dispatcher to add a new task in the
  // blocked round or later.
  oneof optional_blocked_round {
    int64 blocked_round = 4;
  }
}

// Response message for `DispatcherService.ClientHeartbeat`.
// Fields are declared out of tag order; tags 1 through 4 are all in use.
// Next tag: 5
message ClientHeartbeatResponse {
  // A list of all tasks that the client should read from.
  repeated TaskInfo task_info = 1;
  // Tells the client not to start the given round if possible.
  oneof optional_block_round {
    int64 block_round = 3;
  }
  // Whether the iteration has finished.
  bool iteration_finished = 2;
  // tf.data service deployment mode. Supported values are "REMOTE",
  // "COLOCATED", and "HYBRID". If unspecified, it is assumed to be "REMOTE".
  DeploymentMode deployment_mode = 4;
}

// Describes one worker; listed in `GetWorkersResponse.workers`.
// Next tag: 3
message WorkerInfo {
  // Address of the worker.
  string address = 1;
  // Tag 2 is reserved and must not be reused.
  reserved 2;
}

// Request message for `DispatcherService.GetWorkers`. Currently carries no
// fields.
// Next tag: 1
message GetWorkersRequest {}

// Response message for `DispatcherService.GetWorkers`.
// Next tag: 2
message GetWorkersResponse {
  // A list of all workers.
  repeated WorkerInfo workers = 1;
}

// RPC interface of the tf.data service dispatcher. Every method is unary and
// has its own dedicated request and response message.
service DispatcherService {
  // Performs a periodic worker heartbeat.
  rpc WorkerHeartbeat(WorkerHeartbeatRequest) returns (WorkerHeartbeatResponse);

  // Updates the dispatcher with information about the worker's state.
  rpc WorkerUpdate(WorkerUpdateRequest) returns (WorkerUpdateResponse);

  // Gets a dataset definition.
  rpc GetDatasetDef(GetDatasetDefRequest) returns (GetDatasetDefResponse);

  // Gets the next split for a given iteration.
  rpc GetSplit(GetSplitRequest) returns (GetSplitResponse);

  // Returns the API version of the server.
  rpc GetVersion(GetVersionRequest) returns (GetVersionResponse);

  // Registers a dataset with the server, or returns its id if it is already
  // registered.
  //
  // The dataset is constructed in a new graph, so it must not refer to
  // external resources or variables.
  rpc GetOrRegisterDataset(GetOrRegisterDatasetRequest)
      returns (GetOrRegisterDatasetResponse);

  // Gets a job if it already exists, otherwise creates it.
  rpc GetOrCreateJob(GetOrCreateJobRequest) returns (GetOrCreateJobResponse);

  // Gets an iteration if it already exists, otherwise creates it.
  rpc GetOrCreateIteration(GetOrCreateIterationRequest)
      returns (GetOrCreateIterationResponse);

  // Attempts to remove a task from a round-robin read iteration.
  rpc MaybeRemoveTask(MaybeRemoveTaskRequest) returns (MaybeRemoveTaskResponse);

  // Releases an iteration client so that an iteration may eventually be cleaned
  // up.
  rpc ReleaseIterationClient(ReleaseIterationClientRequest)
      returns (ReleaseIterationClientResponse);

  // Heartbeats from the client. This lets the dispatcher know that the client
  // is still active, and gives the dispatcher a chance to notify the client
  // of new tasks.
  rpc ClientHeartbeat(ClientHeartbeatRequest) returns (ClientHeartbeatResponse);

  // Reports a list of all workers registered with the dispatcher.
  rpc GetWorkers(GetWorkersRequest) returns (GetWorkersResponse);

  // Returns the data service metadata for the registered dataset.
  rpc GetDataServiceMetadata(GetDataServiceMetadataRequest)
      returns (GetDataServiceMetadataResponse);

  // Returns the config of a data service cluster.
  rpc GetDataServiceConfig(GetDataServiceConfigRequest)
      returns (GetDataServiceConfigResponse);
}