syntax = "proto3";

package tensorflow.data.experimental;

import "tensorflow/core/protobuf/data_service.proto";

option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";

// Configuration for a tf.data service DispatchServer.
//
// Fields are listed in logical rather than numeric order. Field numbers are
// part of the wire format and must never be changed or reused.
// Next id: 10
message DispatcherConfig {
  // The port for the dispatcher to bind to. A value of 0 indicates that the
  // dispatcher may bind to any available port.
  int64 port = 1;

  // The protocol for the dispatcher to use when connecting to workers.
  string protocol = 2;

  // A work directory to use for storing dispatcher state, and for recovering
  // during restarts. The empty string indicates not to use any work directory.
  string work_dir = 3;

  // Whether to run in fault tolerant mode, where dispatcher state is saved
  // across restarts. Requires that `work_dir` is nonempty.
  bool fault_tolerant_mode = 4;

  // (Optional.) If the job uses auto-sharding, it needs to specify a fixed list
  // of worker addresses that will register with the dispatcher. The worker
  // addresses should be in the format "host" or "host:port", where "port" is an
  // integer, named port, or %port% to match any port.
  repeated string worker_addresses = 7;

  // (Optional.) tf.data service deployment mode. Supported values are "REMOTE",
  // "COLOCATED", and "HYBRID". If unspecified, it is assumed to be "REMOTE".
  DeploymentMode deployment_mode = 9;

  // How often the dispatcher should scan through to delete old and unused
  // jobs, in milliseconds. A value of 0 indicates that the decision should be
  // left up to the runtime.
  int64 job_gc_check_interval_ms = 5;

  // How long a job needs to be unused before it becomes a candidate for garbage
  // collection, in milliseconds. A value of -1 indicates that jobs should never
  // be garbage collected. A value of 0 indicates that the decision should be
  // left up to the runtime.
  int64 job_gc_timeout_ms = 6;

  // How long to wait, in milliseconds, before garbage-collecting a client that
  // hasn't heartbeated to the dispatcher. A value of 0 indicates that the
  // timeout should be left to the runtime.
  int64 client_timeout_ms = 8;
}

// Configuration for a tf.data service WorkerServer.
//
// Fields are listed in logical rather than numeric order. Field numbers are
// part of the wire format and must never be changed or reused.
// Next id: 12
message WorkerConfig {
  // The port for the worker to bind to. A value of 0 indicates that the
  // worker may bind to any available port.
  int64 port = 1;

  // The protocol for the worker to use when connecting to the dispatcher.
  string protocol = 2;

  // The address of the dispatcher to register with.
  string dispatcher_address = 3;

  // The address of the worker server. The substring "%port%", if specified,
  // will be replaced with the worker's bound port. This is useful when the port
  // is set to `0`.
  string worker_address = 4;

  // Tags attached to the worker. This allows reading from selected workers.
  // For example, by applying a "COLOCATED" tag, tf.data service is able to read
  // from the local tf.data worker if one exists, then from off-TF-host workers,
  // to avoid cross-TF-host reads.
  repeated string worker_tags = 10;

  // How often the worker should heartbeat to the dispatcher, in milliseconds.
  // A value of 0 indicates that the decision should be left up to the runtime.
  int64 heartbeat_interval_ms = 5;

  // How long to retry requests to the dispatcher before giving up and reporting
  // an error, in milliseconds. A value of 0 indicates that the decision should
  // be left up to the runtime.
  int64 dispatcher_timeout_ms = 6;

  // The protocol for the worker to use when transferring data to clients.
  string data_transfer_protocol = 7;

  // The data transfer address of the worker server. The substring "%port%", if
  // specified, will be replaced with the worker's bound port. This is useful
  // when the port is set to `0`.
  string data_transfer_address = 8;

  // Maximum size of the cross-trainer cache in bytes. If enabled, make sure
  // your training job provides sufficient memory resources.
  int64 cross_trainer_cache_size_bytes = 11;

  // When shutting down a worker, how long to wait, in milliseconds, for the
  // gRPC server to process the final requests. This is used to achieve clean
  // shutdown in unit tests.
  int64 shutdown_quiet_period_ms = 9;
}