// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.visionai.v1;

import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";

option csharp_namespace = "Google.Cloud.VisionAI.V1";
option go_package = "cloud.google.com/go/visionai/apiv1/visionaipb;visionaipb";
option java_multiple_files = true;
option java_outer_classname = "AnnotationsProto";
option java_package = "com.google.cloud.visionai.v1";
option php_namespace = "Google\\Cloud\\VisionAI\\V1";
option ruby_package = "Google::Cloud::VisionAI::V1";

// Enum describing all possible types of a stream annotation.
enum StreamAnnotationType {
  // Type UNSPECIFIED.
  STREAM_ANNOTATION_TYPE_UNSPECIFIED = 0;

  // An active_zone annotation defines a polygon on top of the content from an
  // image/video based stream. Subsequent processing will only focus on the
  // content inside the active zone.
  STREAM_ANNOTATION_TYPE_ACTIVE_ZONE = 1;

  // A crossing_line annotation defines a polyline on top of the content from
  // an image/video based Vision AI stream. Events happening across the line
  // will be captured, for example, the counts of people who go across the
  // line in the Occupancy Analytics Processor.
  STREAM_ANNOTATION_TYPE_CROSSING_LINE = 2;
}

// Output format for the Personal Protective Equipment Detection Operator.
message PersonalProtectiveEquipmentDetectionOutput {
  // The entity info for annotations from the person detection prediction
  // result.
  message PersonEntity {
    // Entity id.
    int64 person_entity_id = 1;
  }

  // The entity info for annotations from the PPE detection prediction result.
  message PPEEntity {
    // Label id.
    int64 ppe_label_id = 1;

    // Human readable string of the label (examples: helmet, glove, mask).
    string ppe_label_string = 2;

    // Human readable string of the super category label (examples:
    // head_cover, hands_cover, face_cover).
    string ppe_supercategory_label_string = 3;

    // Entity id.
    int64 ppe_entity_id = 4;
  }

  // Bounding box in the normalized coordinates.
  message NormalizedBoundingBox {
    // Min in x coordinate.
    float xmin = 1;

    // Min in y coordinate.
    float ymin = 2;

    // Width of the bounding box.
    float width = 3;

    // Height of the bounding box.
    float height = 4;
  }

  // PersonIdentifiedBox contains the location and the entity info of the
  // person.
  message PersonIdentifiedBox {
    // A unique id for this box.
    int64 box_id = 1;

    // Bounding box in the normalized coordinates.
    NormalizedBoundingBox normalized_bounding_box = 2;

    // Confidence score associated with this box.
    float confidence_score = 3;

    // Person entity info.
    PersonEntity person_entity = 4;
  }
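  // Illustrative only (the values below are assumptions, not tied to any
  // particular model): a PersonIdentifiedBox in textproto form. The box's
  // top-left corner sits at 10% of the frame width/height and it spans
  // 20% x 50% of the frame:
  //
  //   box_id: 7
  //   normalized_bounding_box { xmin: 0.1 ymin: 0.1 width: 0.2 height: 0.5 }
  //   confidence_score: 0.92
  //   person_entity { person_entity_id: 3 }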
  // PPEIdentifiedBox contains the location and the entity info of the PPE.
  message PPEIdentifiedBox {
    // A unique id for this box.
    int64 box_id = 1;

    // Bounding box in the normalized coordinates.
    NormalizedBoundingBox normalized_bounding_box = 2;

    // Confidence score associated with this box.
    float confidence_score = 3;

    // PPE entity info.
    PPEEntity ppe_entity = 4;
  }

  // DetectedPerson contains the detected person, their associated PPE, and
  // the corresponding protection information.
  message DetectedPerson {
    // The id of the detected person.
    int64 person_id = 1;

    // The info of the detected person's identified box.
    PersonIdentifiedBox detected_person_identified_box = 2;

    // The info of the PPE identified boxes associated with the detected
    // person.
    repeated PPEIdentifiedBox detected_ppe_identified_boxes = 3;

    // Coverage score for each body part.
    // Coverage score for face.
    optional float face_coverage_score = 4;

    // Coverage score for eyes.
    optional float eyes_coverage_score = 5;

    // Coverage score for head.
    optional float head_coverage_score = 6;

    // Coverage score for hands.
    optional float hands_coverage_score = 7;

    // Coverage score for body.
    optional float body_coverage_score = 8;

    // Coverage score for feet.
    optional float feet_coverage_score = 9;
  }

  // Current timestamp.
  google.protobuf.Timestamp current_time = 1;

  // A list of DetectedPersons.
  repeated DetectedPerson detected_persons = 2;
}

// Prediction output format for Generic Object Detection.
message ObjectDetectionPredictionResult {
  // The entity info for annotations from the object detection prediction
  // result.
  message Entity {
    // Label id.
    int64 label_id = 1;

    // Human readable string of the label.
    string label_string = 2;
  }

  // IdentifiedBox contains the location and the entity of the object.
  message IdentifiedBox {
    // Bounding box in the normalized coordinates.
    message NormalizedBoundingBox {
      // Min in x coordinate.
      float xmin = 1;

      // Min in y coordinate.
      float ymin = 2;

      // Width of the bounding box.
      float width = 3;

      // Height of the bounding box.
      float height = 4;
    }

    // A unique id for this box.
    int64 box_id = 1;

    // Bounding box in the normalized coordinates.
    NormalizedBoundingBox normalized_bounding_box = 2;

    // Confidence score associated with this box.
    float confidence_score = 3;

    // Entity of this box.
    Entity entity = 4;
  }

  // Current timestamp.
  google.protobuf.Timestamp current_time = 1;

  // A list of identified boxes.
  repeated IdentifiedBox identified_boxes = 2;
}
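// Illustrative only (the labels and values below are assumptions): an
// ObjectDetectionPredictionResult in textproto form with a single identified
// box:
//
//   current_time { seconds: 1700000000 }
//   identified_boxes {
//     box_id: 1
//     normalized_bounding_box { xmin: 0.25 ymin: 0.4 width: 0.1 height: 0.3 }
//     confidence_score: 0.88
//     entity { label_id: 5 label_string: "car" }
//   }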
// Prediction output format for Image Object Detection.
message ImageObjectDetectionPredictionResult {
  // The resource IDs of the AnnotationSpecs that had been identified,
  // ordered by confidence score in descending order. These are the id
  // segments instead of the full resource names.
  repeated int64 ids = 1;

  // The display names of the AnnotationSpecs that had been identified.
  // Order matches the IDs.
  repeated string display_names = 2;

  // The Model's confidences in the correctness of the predicted IDs; a
  // higher value means higher confidence. Order matches the IDs.
  repeated float confidences = 3;

  // Bounding boxes, i.e. the rectangles over the image, that pinpoint
  // the found AnnotationSpecs. Given in order that matches the IDs. Each
  // bounding box is an array of 4 numbers `xMin`, `xMax`, `yMin`, and
  // `yMax`, which represent the extremal coordinates of the box. They are
  // relative to the image size, and the point 0,0 is in the top left
  // of the image.
  repeated google.protobuf.ListValue bboxes = 4;
}

// Prediction output format for Image and Text Classification.
message ClassificationPredictionResult {
  // The resource IDs of the AnnotationSpecs that had been identified.
  repeated int64 ids = 1;

  // The display names of the AnnotationSpecs that had been identified.
  // Order matches the IDs.
  repeated string display_names = 2;

  // The Model's confidences in the correctness of the predicted IDs; a
  // higher value means higher confidence. Order matches the IDs.
  repeated float confidences = 3;
}

// Prediction output format for Image Segmentation.
message ImageSegmentationPredictionResult {
  // A PNG image where each pixel in the mask represents the category to
  // which the pixel in the original image was predicted to belong. The size
  // of this image will be the same as the original image. The mapping
  // between the AnnotationSpec and the color can be found in the model's
  // metadata. The model will choose the most likely category and if none of
  // the categories reach the confidence threshold, the pixel will be marked
  // as background.
  string category_mask = 1;

  // A one-channel image which is encoded as an 8-bit lossless PNG. The size
  // of the image will be the same as the original image. For a specific
  // pixel, a darker color means less confidence in the correctness of the
  // category in the categoryMask for the corresponding pixel. Black means no
  // confidence and white means complete confidence.
  string confidence_mask = 2;
}

// Prediction output format for Video Action Recognition.
message VideoActionRecognitionPredictionResult {
  // Each IdentifiedAction is one particular identification of an action
  // specified with the AnnotationSpec id, display_name and the associated
  // confidence score.
  message IdentifiedAction {
    // The resource ID of the AnnotationSpec that had been identified.
    string id = 1;

    // The display name of the AnnotationSpec that had been identified.
    string display_name = 2;

    // The Model's confidence in the correctness of this identification; a
    // higher value means higher confidence.
    float confidence = 3;
  }

  // The beginning, inclusive, of the video's time segment in which the
  // actions have been identified.
  google.protobuf.Timestamp segment_start_time = 1;

  // The end, inclusive, of the video's time segment in which the actions
  // have been identified. In particular, if the end is the same as the
  // start, it means the identification happens on a specific video frame.
  google.protobuf.Timestamp segment_end_time = 2;

  // All of the actions identified in the time range.
  repeated IdentifiedAction actions = 3;
}
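// Illustrative only (the ids and scores below are assumptions): a
// VideoActionRecognitionPredictionResult in textproto form covering a
// two-second segment:
//
//   segment_start_time { seconds: 10 }
//   segment_end_time { seconds: 12 }
//   actions { id: "456" display_name: "running" confidence: 0.83 }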
// Prediction output format for Video Object Tracking.
message VideoObjectTrackingPredictionResult {
  // Bounding box for the detected object, i.e. the rectangle over the video
  // frame pinpointing the found AnnotationSpec. The coordinates are relative
  // to the frame size, and the point 0,0 is in the top left of the frame.
  message BoundingBox {
    // The leftmost coordinate of the bounding box.
    float x_min = 1;

    // The rightmost coordinate of the bounding box.
    float x_max = 2;

    // The topmost coordinate of the bounding box.
    float y_min = 3;

    // The bottommost coordinate of the bounding box.
    float y_max = 4;
  }

  // Each DetectedObject is one particular identification of an object
  // specified with the AnnotationSpec id and display_name, the bounding box,
  // the associated confidence score and the corresponding track_id.
  message DetectedObject {
    // The resource ID of the AnnotationSpec that had been identified.
    string id = 1;

    // The display name of the AnnotationSpec that had been identified.
    string display_name = 2;

    // Bounding box.
    BoundingBox bounding_box = 3;

    // The Model's confidence in the correctness of this identification; a
    // higher value means higher confidence.
    float confidence = 4;

    // The same object may be identified on multiple frames which are
    // typically adjacent. The set of frames where a particular object has
    // been detected forms a track. This track_id can be used to trace down
    // all frames for a detected object.
    int64 track_id = 5;
  }

  // The beginning, inclusive, of the video's time segment in which the
  // current identifications happen.
  google.protobuf.Timestamp segment_start_time = 1;

  // The end, inclusive, of the video's time segment in which the current
  // identifications happen. In particular, if the end is the same as the
  // start, it means the identifications happen on a specific video frame.
  google.protobuf.Timestamp segment_end_time = 2;

  // All of the objects detected in the specified time range.
  repeated DetectedObject objects = 3;
}

// Prediction output format for Video Classification.
message VideoClassificationPredictionResult {
  // Each IdentifiedClassification is one particular identification of a
  // classification specified with the AnnotationSpec id and display_name,
  // and the associated confidence score.
  message IdentifiedClassification {
    // The resource ID of the AnnotationSpec that had been identified.
    string id = 1;

    // The display name of the AnnotationSpec that had been identified.
    string display_name = 2;

    // The Model's confidence in the correctness of this identification; a
    // higher value means higher confidence.
    float confidence = 3;
  }

  // The beginning, inclusive, of the video's time segment in which the
  // classifications have been identified.
  google.protobuf.Timestamp segment_start_time = 1;

  // The end, inclusive, of the video's time segment in which the
  // classifications have been identified. In particular, if the end is the
  // same as the start, it means the identification happens on a specific
  // video frame.
  google.protobuf.Timestamp segment_end_time = 2;

  // All of the classifications identified in the time range.
  repeated IdentifiedClassification classifications = 3;
}
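// Illustrative only (the ids and scores below are assumptions): a
// VideoClassificationPredictionResult in textproto form where the segment
// start and end coincide, i.e. the classification applies to a single video
// frame:
//
//   segment_start_time { seconds: 100 }
//   segment_end_time { seconds: 100 }
//   classifications { id: "123" display_name: "daytime" confidence: 0.97 }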
// The prediction result proto for occupancy counting.
message OccupancyCountingPredictionResult {
  // The entity info for annotations from the occupancy counting operator.
  message Entity {
    // Label id.
    int64 label_id = 1;

    // Human readable string of the label.
    string label_string = 2;
  }

  // IdentifiedBox contains the location and the entity of the object.
  message IdentifiedBox {
    // Bounding box in the normalized coordinates.
    message NormalizedBoundingBox {
      // Min in x coordinate.
      float xmin = 1;

      // Min in y coordinate.
      float ymin = 2;

      // Width of the bounding box.
      float width = 3;

      // Height of the bounding box.
      float height = 4;
    }

    // A unique id for this box.
    int64 box_id = 1;

    // Bounding box in the normalized coordinates.
    NormalizedBoundingBox normalized_bounding_box = 2;

    // Confidence score associated with this box.
    float score = 3;

    // Entity of this box.
    Entity entity = 4;

    // A unique id to identify a track. It should be consistent across
    // frames. It only exists if tracking is enabled.
    int64 track_id = 5;
  }

  // The statistics info for annotations from the occupancy counting
  // operator.
  message Stats {
    // The object info and instant count for annotations from the occupancy
    // counting operator.
    message ObjectCount {
      // Entity of this object.
      Entity entity = 1;

      // Count of the object.
      int32 count = 2;
    }

    // The object info and accumulated count for annotations from the
    // occupancy counting operator.
    message AccumulatedObjectCount {
      // The start time of the accumulated count.
      google.protobuf.Timestamp start_time = 1;

      // The object count for the accumulated count.
      ObjectCount object_count = 2;
    }

    // Message for the crossing line count.
    message CrossingLineCount {
      // Line annotation from the user.
      StreamAnnotation annotation = 1;

      // The direction that follows the right hand rule.
      repeated ObjectCount positive_direction_counts = 2;

      // The direction that is opposite to the right hand rule.
      repeated ObjectCount negative_direction_counts = 3;

      // The accumulated positive count.
      repeated AccumulatedObjectCount accumulated_positive_direction_counts = 4;

      // The accumulated negative count.
      repeated AccumulatedObjectCount accumulated_negative_direction_counts = 5;
    }
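    // Illustrative only (the entities and counts below are assumptions): a
    // CrossingLineCount in textproto form. Crossings following the right
    // hand rule relative to the annotated polyline are tallied as positive,
    // and the opposite crossings as negative:
    //
    //   annotation { id: "line-1" type: STREAM_ANNOTATION_TYPE_CROSSING_LINE }
    //   positive_direction_counts { entity { label_string: "person" } count: 3 }
    //   negative_direction_counts { entity { label_string: "person" } count: 1 }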
    // Message for the active zone count.
    message ActiveZoneCount {
      // Active zone annotation from the user.
      StreamAnnotation annotation = 1;

      // Counts in the zone.
      repeated ObjectCount counts = 2;
    }

    // Counts of the full frame.
    repeated ObjectCount full_frame_count = 1;

    // Crossing line counts.
    repeated CrossingLineCount crossing_line_counts = 2;

    // Active zone counts.
    repeated ActiveZoneCount active_zone_counts = 3;
  }

  // The track info for annotations from the occupancy counting operator.
  message TrackInfo {
    // A unique id to identify a track. It should be consistent across
    // frames.
    string track_id = 1;

    // Start timestamp of this track.
    google.protobuf.Timestamp start_time = 2;
  }

  // The dwell time info for annotations from the occupancy counting
  // operator.
  message DwellTimeInfo {
    // A unique id to identify a track. It should be consistent across
    // frames.
    string track_id = 1;

    // The unique id for the zone in which the object is dwelling/waiting.
    string zone_id = 2;

    // The beginning time when a dwelling object has been identified in a
    // zone.
    google.protobuf.Timestamp dwell_start_time = 3;

    // The end time when a dwelling object has exited a zone.
    google.protobuf.Timestamp dwell_end_time = 4;
  }

  // Current timestamp.
  google.protobuf.Timestamp current_time = 1;

  // A list of identified boxes.
  repeated IdentifiedBox identified_boxes = 2;

  // Detection statistics.
  Stats stats = 3;

  // Track related information. All the tracks that are live at this
  // timestamp. It only exists if tracking is enabled.
  repeated TrackInfo track_info = 4;

  // Dwell time related information. All the tracks that are live in a given
  // zone with a start and end dwell time timestamp.
  repeated DwellTimeInfo dwell_time_info = 5;

  // The presentation timestamp of the frame.
  optional int64 pts = 6;
}

// Message describing annotations of a Vision AI stream resource.
message StreamAnnotation {
  oneof annotation_payload {
    // Annotation for type ACTIVE_ZONE.
    NormalizedPolygon active_zone = 5;

    // Annotation for type CROSSING_LINE.
    NormalizedPolyline crossing_line = 6;
  }

  // ID of the annotation. It must be unique when used in a certain context.
  // For example, all the annotations to one input stream of a Vision AI
  // application.
  string id = 1;

  // User-friendly name for the annotation.
  string display_name = 2;

  // The Vision AI stream resource name.
  string source_stream = 3;

  // The actual type of Annotation.
  StreamAnnotationType type = 4;
}

// A wrapper of repeated StreamAnnotation.
message StreamAnnotations {
  // Multiple annotations.
  repeated StreamAnnotation stream_annotations = 1;
}

// Normalized Polygon.
message NormalizedPolygon {
  // The bounding polygon normalized vertices. The top left corner of the
  // image will be [0, 0].
  repeated NormalizedVertex normalized_vertices = 1;
}

// Normalized Polyline, which represents a curve consisting of connected
// straight-line segments.
message NormalizedPolyline {
  // A sequence of vertices connected by straight lines.
  repeated NormalizedVertex normalized_vertices = 1;
}

// A vertex represents a 2D point in the image.
// NOTE: the normalized vertex coordinates are relative to the original image
// and range from 0 to 1.
message NormalizedVertex {
  // X coordinate.
  float x = 1;

  // Y coordinate.
  float y = 2;
}
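// Illustrative only (the id, display name, coordinates, and stream resource
// name below are assumptions): a StreamAnnotation in textproto form defining
// a triangular active zone over the upper-left part of the frame:
//
//   id: "zone-1"
//   display_name: "loading dock"
//   source_stream: "projects/my-project/locations/us-central1/clusters/my-cluster/streams/my-stream"
//   type: STREAM_ANNOTATION_TYPE_ACTIVE_ZONE
//   active_zone {
//     normalized_vertices { x: 0.0 y: 0.0 }
//     normalized_vertices { x: 0.5 y: 0.0 }
//     normalized_vertices { x: 0.0 y: 0.5 }
//   }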
// Message of essential metadata of App Platform.
// This message is usually attached to a certain processor output annotation
// for the customer to identify the source of the data.
message AppPlatformMetadata {
  // The application resource name.
  string application = 1;

  // The instance resource id. Instance is the nested resource of the
  // application under the collection 'instances'.
  string instance_id = 2;

  // The node name of the application graph.
  string node = 3;

  // The referred processor resource name of the application node.
  string processor = 4;
}

// For any Cloud Function based customer processing logic, the customer's
// Cloud Function is expected to receive AppPlatformCloudFunctionRequest as
// the request and send back AppPlatformCloudFunctionResponse as the
// response.
// Message of the request from AppPlatform to the Cloud Function.
message AppPlatformCloudFunctionRequest {
  // A general annotation message that uses struct format to represent
  // different concrete annotation protobufs.
  message StructedInputAnnotation {
    // The ingestion time of the current annotation.
    int64 ingestion_time_micros = 1;

    // The struct format of the actual annotation.
    google.protobuf.Struct annotation = 2;
  }

  // The metadata of the AppPlatform for the customer to identify the source
  // of the payload.
  AppPlatformMetadata app_platform_metadata = 1;

  // The actual annotations to be processed by the customized Cloud Function.
  repeated StructedInputAnnotation annotations = 2;
}

// Message of the response from the customer's Cloud Function to AppPlatform.
message AppPlatformCloudFunctionResponse {
  // A general annotation message that uses struct format to represent
  // different concrete annotation protobufs.
  message StructedOutputAnnotation {
    // The struct format of the actual annotation.
    google.protobuf.Struct annotation = 1;
  }

  // The modified annotations that are returned to AppPlatform.
  // If the annotations fields are empty, those annotations will be dropped
  // by AppPlatform.
  repeated StructedOutputAnnotation annotations = 2;

  // If set to true, AppPlatform will use the original annotations instead of
  // dropping them, even if the annotations field is empty.
  bool annotation_passthrough = 3;

  // The event notifications that are returned to AppPlatform. Typically they
  // will then be configured to be consumed/forwarded to an operator that
  // handles events, such as the Pub/Sub operator.
  repeated AppPlatformEventBody events = 4;
}

// Message of the content of an AppPlatform event.
message AppPlatformEventBody {
  // Human readable string of the event, e.g. "There are more than 6 people
  // in the scene." or "Shelf is empty!".
  string event_message = 1;

  // For the case of Pub/Sub, it will be stored in the message attributes.
  // pubsub.proto
  google.protobuf.Struct payload = 2;

  // User defined Event Id, used to classify the event. Within a delivery
  // interval, events from the same application instance with the same id
  // will be de-duplicated and only the first one will be sent out. An empty
  // event_id will be treated as "".
  string event_id = 3;
}
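// Illustrative only (the message text, payload key, and event id below are
// assumptions): an AppPlatformEventBody in textproto form, as it might be
// returned inside an AppPlatformCloudFunctionResponse and forwarded to the
// Pub/Sub operator:
//
//   event_message: "There are more than 6 people in the scene."
//   payload { fields { key: "zone_id" value { string_value: "zone-1" } } }
//   event_id: "occupancy-alert"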