// xref: /aosp_15_r20/external/tensorflow/tensorflow/core/protobuf/service_config.proto (revision b6fb3261f9314811a0f4371741dbb8839866f948)
syntax = "proto3";

package tensorflow.data.experimental;

// NOTE(review): DeploymentMode (used by DispatcherConfig) is not declared in
// this file; presumably it is provided by this import -- confirm.
import "tensorflow/core/protobuf/data_service.proto";

option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";

// Configuration for a tf.data service DispatchServer.
//
// Fields are grouped by purpose rather than listed in tag order. Tags are part
// of the wire format: never renumber or reuse them, and bump "Next id" when a
// field is added.
// Next id: 10
message DispatcherConfig {
  // The port for the dispatcher to bind to. A value of 0 indicates that the
  // dispatcher may bind to any available port.
  int64 port = 1;
  // The protocol for the dispatcher to use when connecting to workers.
  string protocol = 2;
  // A work directory to use for storing dispatcher state, and for recovering
  // during restarts. The empty string indicates not to use any work directory.
  string work_dir = 3;
  // Whether to run in fault tolerant mode, where dispatcher state is saved
  // across restarts. Requires that `work_dir` is nonempty.
  bool fault_tolerant_mode = 4;
  // (Optional.) If the job uses auto-sharding, it needs to specify a fixed list
  // of worker addresses that will register with the dispatcher. The worker
  // addresses should be in the format "host" or "host:port", where "port" is an
  // integer, named port, or %port% to match any port.
  repeated string worker_addresses = 7;
  // (Optional.) tf.data service deployment mode. Supported values are "REMOTE",
  // "COLOCATED", and "HYBRID". If unspecified, it is assumed to be "REMOTE".
  // NOTE(review): DeploymentMode is not declared in this file; presumably it
  // comes from the imported data_service.proto -- confirm the exact enum value
  // names there.
  DeploymentMode deployment_mode = 9;
  // How often the dispatcher should scan through to delete old and unused
  // jobs. A value of 0 indicates that the decision should be left up to the
  // runtime.
  int64 job_gc_check_interval_ms = 5;
  // How long a job needs to be unused before it becomes a candidate for garbage
  // collection. A value of -1 indicates that jobs should never be garbage
  // collected. A value of 0 indicates that the decision should be left up to
  // the runtime.
  int64 job_gc_timeout_ms = 6;
  // How long to wait before garbage-collecting a client that hasn't
  // heartbeated to the dispatcher. A value of 0 indicates that the timeout
  // should be left to the runtime.
  int64 client_timeout_ms = 8;
}

// Configuration for a tf.data service WorkerServer.
//
// Fields are grouped by purpose rather than listed in tag order. Tags are part
// of the wire format: never renumber or reuse them, and bump "Next id" when a
// field is added.
// Next id: 12
message WorkerConfig {
  // The port for the worker to bind to. A value of 0 indicates that the
  // worker may bind to any available port.
  int64 port = 1;
  // The protocol for the worker to use when connecting to the dispatcher.
  string protocol = 2;
  // The address of the dispatcher to register with.
  string dispatcher_address = 3;
  // The address of the worker server. The substring "%port%", if specified,
  // will be replaced with the worker's bound port. This is useful when the port
  // is set to `0`.
  string worker_address = 4;
  // Tags attached to the worker. This allows reading from selected workers.
  // For example, by applying a "COLOCATED" tag, tf.data service is able to read
  // from the local tf.data worker if one exists, then from off-TF-host workers,
  // to avoid cross-TF-host reads.
  repeated string worker_tags = 10;
  // How often the worker should heartbeat to the dispatcher. A value of 0
  // indicates that the decision should be left up to the runtime.
  int64 heartbeat_interval_ms = 5;
  // How long to retry requests to the dispatcher before giving up and reporting
  // an error. A value of 0 indicates that the decision should be left up to the
  // runtime.
  int64 dispatcher_timeout_ms = 6;
  // The protocol for the worker to use when transferring data to clients.
  string data_transfer_protocol = 7;
  // The data transfer address of the worker server. The substring "%port%", if
  // specified, will be replaced with the worker's bound port. This is useful
  // when the port is set to `0`.
  string data_transfer_address = 8;
  // Maximum size of the cross-trainer cache in bytes. If enabled, make sure
  // your training job provides sufficient memory resources.
  int64 cross_trainer_cache_size_bytes = 11;
  // When shutting down a worker, how long to wait for the gRPC server to
  // process the final requests. This is used to achieve clean shutdown in unit
  // tests.
  int64 shutdown_quiet_period_ms = 9;
}
