aiplatform/v1beta1/evaluation_service.proto

// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1Beta1";
option go_package = "cloud.google.com/go/aiplatform/apiv1beta1/aiplatformpb;aiplatformpb";
option java_multiple_files = true;
option java_outer_classname = "EvaluationServiceProto";
option java_package = "com.google.cloud.aiplatform.v1beta1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1beta1";
option ruby_package = "Google::Cloud::AIPlatform::V1beta1";

// Vertex AI Online Evaluation Service.
//
// Served from `aiplatform.googleapis.com`; declares the `cloud-platform`
// OAuth scope.
service EvaluationService {
  option (google.api.default_host) = "aiplatform.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Evaluates instances based on a given metric.
  //
  // HTTP mapping: `POST` to the `:evaluateInstances` custom method on a
  // `projects/*/locations/*` path. The path is bound to
  // `EvaluateInstancesRequest.location`; all remaining request fields are
  // taken from the HTTP body (`body: "*"`).
  rpc EvaluateInstances(EvaluateInstancesRequest)
      returns (EvaluateInstancesResponse) {
    option (google.api.http) = {
      post: "/v1beta1/{location=projects/*/locations/*}:evaluateInstances"
      body: "*"
    };
  }
}

// Pairwise prediction autorater preference.
//
// Used by the `pairwise_choice` field of the pairwise result messages to
// report which of the two compared predictions was preferred.
enum PairwiseChoice {
  // Unspecified prediction choice.
  PAIRWISE_CHOICE_UNSPECIFIED = 0;

  // Baseline prediction wins.
  BASELINE = 1;

  // Candidate prediction wins.
  CANDIDATE = 2;

  // Winner cannot be determined.
  TIE = 3;
}

// Request message for EvaluationService.EvaluateInstances.
//
// `metric_inputs` is a oneof, so at most one metric input can be set per
// request.
message EvaluateInstancesRequest {
  // Instances and specs for evaluation.
  oneof metric_inputs {
    // Auto metric: instances and metric spec for exact match metric.
    ExactMatchInput exact_match_input = 2;

    // Auto metric: instances and metric spec for bleu metric.
    BleuInput bleu_input = 3;

    // Auto metric: instances and metric spec for rouge metric.
    RougeInput rouge_input = 4;

    // LLM-based metric: input for fluency metric. A general text generation
    // metric, applicable to other categories.
    FluencyInput fluency_input = 5;

    // Input for coherence metric.
    CoherenceInput coherence_input = 6;

    // Input for safety metric.
    SafetyInput safety_input = 8;

    // Input for groundedness metric.
    GroundednessInput groundedness_input = 9;

    // Input for fulfillment metric.
    FulfillmentInput fulfillment_input = 12;

    // Input for summarization quality metric.
    SummarizationQualityInput summarization_quality_input = 7;

    // Input for pairwise summarization quality metric.
    PairwiseSummarizationQualityInput pairwise_summarization_quality_input = 23;

    // Input for summarization helpfulness metric.
    SummarizationHelpfulnessInput summarization_helpfulness_input = 14;

    // Input for summarization verbosity metric.
    SummarizationVerbosityInput summarization_verbosity_input = 15;

    // Input for question answering quality metric.
    QuestionAnsweringQualityInput question_answering_quality_input = 10;

    // Input for pairwise question answering quality metric.
    PairwiseQuestionAnsweringQualityInput
        pairwise_question_answering_quality_input = 24;

    // Input for question answering relevance metric.
    QuestionAnsweringRelevanceInput question_answering_relevance_input = 16;

    // Input for question answering helpfulness metric.
    QuestionAnsweringHelpfulnessInput question_answering_helpfulness_input = 17;

    // Input for question answering correctness metric.
    QuestionAnsweringCorrectnessInput question_answering_correctness_input = 18;

    // Tool call metric: input for tool call valid metric.
    ToolCallValidInput tool_call_valid_input = 19;

    // Tool call metric: input for tool name match metric.
    ToolNameMatchInput tool_name_match_input = 20;

    // Tool call metric: input for tool parameter key match metric.
    ToolParameterKeyMatchInput tool_parameter_key_match_input = 21;

    // Tool call metric: input for tool parameter key value match metric.
    ToolParameterKVMatchInput tool_parameter_kv_match_input = 22;
  }

  // Required. The resource name of the Location to evaluate the instances.
  // Format: `projects/{project}/locations/{location}`
  //
  // This field is bound from the URL path in the HTTP mapping of
  // EvaluateInstances.
  string location = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "locations.googleapis.com/Location"
    }
  ];
}

// Response message for EvaluationService.EvaluateInstances.
//
// `evaluation_results` is a oneof, so at most one result is set per response.
// NOTE(review): presumably the populated result corresponds to the metric
// input that was set in EvaluateInstancesRequest - confirm.
message EvaluateInstancesResponse {
  // Evaluation results will be served in the same order as the instances
  // presented in the EvaluateInstancesRequest metric input.
  oneof evaluation_results {
    // Auto metric: results for exact match metric.
    ExactMatchResults exact_match_results = 1;

    // Auto metric: results for bleu metric.
    BleuResults bleu_results = 2;

    // Auto metric: results for rouge metric.
    RougeResults rouge_results = 3;

    // LLM-based metric: result for fluency metric. A general text generation
    // metric, applicable to other categories.
    FluencyResult fluency_result = 4;

    // Result for coherence metric.
    CoherenceResult coherence_result = 5;

    // Result for safety metric.
    SafetyResult safety_result = 7;

    // Result for groundedness metric.
    GroundednessResult groundedness_result = 8;

    // Result for fulfillment metric.
    FulfillmentResult fulfillment_result = 11;

    // Summarization only metric: result for summarization quality metric.
    SummarizationQualityResult summarization_quality_result = 6;

    // Result for pairwise summarization quality metric.
    PairwiseSummarizationQualityResult pairwise_summarization_quality_result =
        22;

    // Result for summarization helpfulness metric.
    SummarizationHelpfulnessResult summarization_helpfulness_result = 13;

    // Result for summarization verbosity metric.
    SummarizationVerbosityResult summarization_verbosity_result = 14;

    // Question answering only metric: result for question answering quality
    // metric.
    QuestionAnsweringQualityResult question_answering_quality_result = 9;

    // Result for pairwise question answering quality metric.
    PairwiseQuestionAnsweringQualityResult
        pairwise_question_answering_quality_result = 23;

    // Result for question answering relevance metric.
    QuestionAnsweringRelevanceResult question_answering_relevance_result = 15;

    // Result for question answering helpfulness metric.
    QuestionAnsweringHelpfulnessResult question_answering_helpfulness_result =
        16;

    // Result for question answering correctness metric.
    QuestionAnsweringCorrectnessResult question_answering_correctness_result =
        17;

    // Tool call metric: results for tool call valid metric.
    ToolCallValidResults tool_call_valid_results = 18;

    // Tool call metric: results for tool name match metric.
    ToolNameMatchResults tool_name_match_results = 19;

    // Tool call metric: results for tool parameter key match metric.
    ToolParameterKeyMatchResults tool_parameter_key_match_results = 20;

    // Tool call metric: results for tool parameter key value match metric.
    ToolParameterKVMatchResults tool_parameter_kv_match_results = 21;
  }
}

// Input for exact match metric: one metric spec together with the instances
// to evaluate.
message ExactMatchInput {
  // Required. Spec for exact match metric.
  ExactMatchSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated exact match instances.
  repeated ExactMatchInstance instances = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Exact match instance: a prediction and the reference it is compared
// against.
//
// Both fields are declared `optional`, so their presence is tracked
// explicitly.
message ExactMatchInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for exact match metric - returns 1 if prediction and reference exactly
// match, otherwise 0.
//
// This message currently has no fields.
message ExactMatchSpec {}

// Results for exact match metric.
message ExactMatchResults {
  // Output only. Exact match metric values.
  // NOTE(review): presumably one value per input instance, in request order -
  // confirm.
  repeated ExactMatchMetricValue exact_match_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Exact match metric value for an instance.
message ExactMatchMetricValue {
  // Output only. Exact match score. Declared `optional`, so an unset score
  // can be told apart from an explicit 0.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for bleu metric: one metric spec together with the instances to
// evaluate.
message BleuInput {
  // Required. Spec for bleu score metric.
  BleuSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated bleu instances.
  repeated BleuInstance instances = 2 [(google.api.field_behavior) = REQUIRED];
}

// Bleu instance: a prediction and the reference it is compared against.
//
// Both fields are declared `optional`, so their presence is tracked
// explicitly.
message BleuInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for bleu score metric - calculates the precision of n-grams in the
// prediction as compared to reference - returns a score ranging between 0 and
// 1.
//
// This message currently has no fields.
message BleuSpec {}

// Results for bleu metric.
message BleuResults {
  // Output only. Bleu metric values.
  // NOTE(review): presumably one value per input instance, in request order -
  // confirm.
  repeated BleuMetricValue bleu_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Bleu metric value for an instance.
message BleuMetricValue {
  // Output only. Bleu score. Declared `optional`, so an unset score can be
  // told apart from an explicit 0.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for rouge metric: one metric spec together with the instances to
// evaluate.
message RougeInput {
  // Required. Spec for rouge score metric.
  RougeSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated rouge instances.
  repeated RougeInstance instances = 2 [(google.api.field_behavior) = REQUIRED];
}

// Rouge instance: a prediction and the reference it is compared against.
//
// Both fields are declared `optional`, so their presence is tracked
// explicitly.
message RougeInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for rouge score metric - calculates the recall of n-grams in the
// prediction as compared to reference - returns a score ranging between 0 and
// 1.
message RougeSpec {
  // Optional. Supported rouge types are rougen[1-9], rougeL and rougeLsum.
  // NOTE(review): the behavior when this is left empty is not stated here -
  // confirm the server-side default.
  string rouge_type = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Whether to use stemmer to compute rouge score.
  bool use_stemmer = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Whether to split summaries while using rougeLsum.
  bool split_summaries = 3 [(google.api.field_behavior) = OPTIONAL];
}

// Results for rouge metric.
message RougeResults {
  // Output only. Rouge metric values.
  // NOTE(review): presumably one value per input instance, in request order -
  // confirm.
  repeated RougeMetricValue rouge_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Rouge metric value for an instance.
message RougeMetricValue {
  // Output only. Rouge score. Declared `optional`, so an unset score can be
  // told apart from an explicit 0.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for coherence metric: the metric spec together with the single
// instance to evaluate.
message CoherenceInput {
  // Required. Spec for coherence score metric.
  CoherenceSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Coherence instance.
  CoherenceInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Coherence instance: the model output to be evaluated.
message CoherenceInstance {
  // Required. Output of the evaluated model. Declared `optional`, so
  // presence is tracked explicitly.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];
}

// Spec for coherence score metric.
message CoherenceSpec {
  // Optional. Which version to use for evaluation.
  // NOTE(review): the set of valid versions is not stated here - confirm.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for coherence metric.
//
// `score` and `confidence` are declared `optional`, so an unset value can be
// told apart from an explicit 0.
message CoherenceResult {
  // Output only. Coherence score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for coherence score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for coherence score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for fluency metric: the metric spec together with the single
// instance to evaluate.
message FluencyInput {
  // Required. Spec for fluency score metric.
  FluencySpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Fluency instance.
  FluencyInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Fluency instance: the model output to be evaluated.
message FluencyInstance {
  // Required. Output of the evaluated model. Declared `optional`, so
  // presence is tracked explicitly.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];
}

// Spec for fluency score metric.
message FluencySpec {
  // Optional. Which version to use for evaluation.
  // NOTE(review): the set of valid versions is not stated here - confirm.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for fluency metric.
//
// `score` and `confidence` are declared `optional`, so an unset value can be
// told apart from an explicit 0.
message FluencyResult {
  // Output only. Fluency score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for fluency score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for fluency score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for safety metric: the metric spec together with the single instance
// to evaluate.
message SafetyInput {
  // Required. Spec for safety metric.
  SafetySpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Safety instance.
  SafetyInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Safety instance: the model output to be evaluated.
message SafetyInstance {
  // Required. Output of the evaluated model. Declared `optional`, so
  // presence is tracked explicitly.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];
}

// Spec for safety metric.
message SafetySpec {
  // Optional. Which version to use for evaluation.
  // NOTE(review): the set of valid versions is not stated here - confirm.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for safety metric.
//
// `score` and `confidence` are declared `optional`, so an unset value can be
// told apart from an explicit 0.
message SafetyResult {
  // Output only. Safety score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for safety score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for safety score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for groundedness metric: the metric spec together with the single
// instance to evaluate.
message GroundednessInput {
  // Required. Spec for groundedness metric.
  GroundednessSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Groundedness instance.
  GroundednessInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Groundedness instance: a prediction and the context it is compared
// against.
//
// Both fields are declared `optional`, so their presence is tracked
// explicitly.
message GroundednessInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Background information provided in context used to compare
  // against the prediction.
  optional string context = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for groundedness metric.
message GroundednessSpec {
  // Optional. Which version to use for evaluation.
  // NOTE(review): the set of valid versions is not stated here - confirm.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for groundedness metric.
//
// `score` and `confidence` are declared `optional`, so an unset value can be
// told apart from an explicit 0.
message GroundednessResult {
  // Output only. Groundedness score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for groundedness score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for groundedness score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for fulfillment metric: the metric spec together with the single
// instance to evaluate.
message FulfillmentInput {
  // Required. Spec for fulfillment score metric.
  FulfillmentSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Fulfillment instance.
  FulfillmentInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Fulfillment instance: a prediction and the instruction it is compared
// with.
//
// Both fields are declared `optional`, so their presence is tracked
// explicitly.
message FulfillmentInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Inference instruction prompt to compare prediction with.
  optional string instruction = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for fulfillment metric.
message FulfillmentSpec {
  // Optional. Which version to use for evaluation.
  // NOTE(review): the set of valid versions is not stated here - confirm.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for fulfillment metric.
//
// `score` and `confidence` are declared `optional`, so an unset value can be
// told apart from an explicit 0.
message FulfillmentResult {
  // Output only. Fulfillment score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for fulfillment score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for fulfillment score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for summarization quality metric: the metric spec together with the
// single instance to evaluate.
message SummarizationQualityInput {
  // Required. Spec for summarization quality score metric.
  SummarizationQualitySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization quality instance.
  SummarizationQualityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Summarization quality instance: the summary to evaluate, the text that was
// summarized, the summarization prompt and an optional reference.
//
// All fields are declared `optional`, so their presence is tracked
// explicitly.
message SummarizationQualityInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to be summarized.
  optional string context = 3 [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization prompt for LLM.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Spec for summarization quality score metric.
message SummarizationQualitySpec {
  // Optional. Whether to use instance.reference to compute summarization
  // quality.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  // NOTE(review): the set of valid versions is not stated here - confirm.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for summarization quality metric.
//
// `score` and `confidence` are declared `optional`, so an unset value can be
// told apart from an explicit 0.
message SummarizationQualityResult {
  // Output only. Summarization quality score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for summarization quality score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for summarization quality score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for pairwise summarization quality metric: the metric spec together
// with the single instance to evaluate.
message PairwiseSummarizationQualityInput {
  // Required. Spec for pairwise summarization quality score metric.
  PairwiseSummarizationQualitySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Pairwise summarization quality instance.
  PairwiseSummarizationQualityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Pairwise summarization quality instance: the candidate and baseline
// summaries to compare, the text that was summarized, the summarization
// prompt and an optional reference.
//
// All fields are declared `optional`, so their presence is tracked
// explicitly.
message PairwiseSummarizationQualityInstance {
  // Required. Output of the candidate model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Output of the baseline model.
  optional string baseline_prediction = 2
      [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to be summarized.
  optional string context = 4 [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization prompt for LLM.
  optional string instruction = 5 [(google.api.field_behavior) = REQUIRED];
}

// Spec for pairwise summarization quality score metric.
message PairwiseSummarizationQualitySpec {
  // Optional. Whether to use instance.reference to compute pairwise
  // summarization quality.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  // NOTE(review): the set of valid versions is not stated here - confirm.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for pairwise summarization quality metric.
//
// Unlike the non-pairwise results, this message carries a PairwiseChoice
// rather than a numeric score. `confidence` is declared `optional`, so an
// unset value can be told apart from an explicit 0.
message PairwiseSummarizationQualityResult {
  // Output only. Pairwise summarization prediction choice.
  PairwiseChoice pairwise_choice = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for the pairwise summarization quality result.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for the pairwise summarization quality result.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for summarization helpfulness metric: the metric spec together with
// the single instance to evaluate.
message SummarizationHelpfulnessInput {
  // Required. Spec for summarization helpfulness score metric.
  SummarizationHelpfulnessSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization helpfulness instance.
  SummarizationHelpfulnessInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Summarization helpfulness instance: the summary to evaluate and the text
// that was summarized, plus an optional reference and summarization prompt.
//
// All fields are declared `optional`, so their presence is tracked
// explicitly.
message SummarizationHelpfulnessInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to be summarized.
  optional string context = 3 [(google.api.field_behavior) = REQUIRED];

  // Optional. Summarization prompt for LLM.
  optional string instruction = 4 [(google.api.field_behavior) = OPTIONAL];
}

// Spec for summarization helpfulness score metric.
message SummarizationHelpfulnessSpec {
  // Optional. Whether to use instance.reference to compute summarization
  // helpfulness.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  // NOTE(review): the set of valid versions is not stated here - confirm.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for summarization helpfulness metric.
//
// `score` and `confidence` are declared `optional`, so an unset value can be
// told apart from an explicit 0.
message SummarizationHelpfulnessResult {
  // Output only. Summarization helpfulness score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for summarization helpfulness score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for summarization helpfulness score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for summarization verbosity metric: the metric spec together with
// the single instance to evaluate.
message SummarizationVerbosityInput {
  // Required. Spec for summarization verbosity score metric.
  SummarizationVerbositySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization verbosity instance.
  SummarizationVerbosityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Summarization verbosity instance: the summary to evaluate and the text
// that was summarized, plus an optional reference and summarization prompt.
//
// All fields are declared `optional`, so their presence is tracked
// explicitly.
message SummarizationVerbosityInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to be summarized.
  optional string context = 3 [(google.api.field_behavior) = REQUIRED];

  // Optional. Summarization prompt for LLM.
  optional string instruction = 4 [(google.api.field_behavior) = OPTIONAL];
}

// Spec for summarization verbosity score metric.
message SummarizationVerbositySpec {
  // Optional. Whether to use instance.reference to compute summarization
  // verbosity.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  // NOTE(review): the set of valid versions is not stated here - confirm.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for summarization verbosity metric.
//
// `score` and `confidence` are declared `optional`, so an unset value can be
// told apart from an explicit 0.
message SummarizationVerbosityResult {
  // Output only. Summarization verbosity score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for summarization verbosity score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for summarization verbosity score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for question answering quality metric: the metric spec together with
// the single instance to evaluate.
message QuestionAnsweringQualityInput {
  // Required. Spec for question answering quality score metric.
  QuestionAnsweringQualitySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Question answering quality instance.
  QuestionAnsweringQualityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Question answering quality instance: the answer to evaluate, the text used
// to answer the question, the question answering prompt and an optional
// reference.
//
// All fields are declared `optional`, so their presence is tracked
// explicitly.
message QuestionAnsweringQualityInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to answer the question.
  optional string context = 3 [(google.api.field_behavior) = REQUIRED];

  // Required. Question answering prompt for LLM.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Spec for question answering quality score metric.
message QuestionAnsweringQualitySpec {
  // Optional. Whether to use instance.reference to compute question answering
  // quality.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  // NOTE(review): the set of valid versions is not stated here - confirm.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for question answering quality metric.
//
// `score` and `confidence` are declared `optional`, so an unset value can be
// told apart from an explicit 0.
message QuestionAnsweringQualityResult {
  // Output only. Question answering quality score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for question answering quality score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for question answering quality score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for pairwise question answering quality metric: the metric spec
// together with the single instance to evaluate.
message PairwiseQuestionAnsweringQualityInput {
  // Required. Spec for pairwise question answering quality score metric.
  PairwiseQuestionAnsweringQualitySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Pairwise question answering quality instance.
  PairwiseQuestionAnsweringQualityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Pairwise question answering quality instance: the candidate and baseline
// answers to compare, the text used to answer the question, the question
// answering prompt and an optional reference.
//
// All fields are declared `optional`, so their presence is tracked
// explicitly.
message PairwiseQuestionAnsweringQualityInstance {
  // Required. Output of the candidate model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Output of the baseline model.
  optional string baseline_prediction = 2
      [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to answer the question.
  optional string context = 4 [(google.api.field_behavior) = REQUIRED];

  // Required. Question answering prompt for LLM.
  optional string instruction = 5 [(google.api.field_behavior) = REQUIRED];
}

// Spec for pairwise question answering quality score metric.
message PairwiseQuestionAnsweringQualitySpec {
  // Optional. Whether to use instance.reference to compute pairwise question
  // answering quality.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  // NOTE(review): the set of valid versions is not stated here - confirm.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for pairwise question answering quality metric.
//
// Unlike the non-pairwise results, this message carries a PairwiseChoice
// rather than a numeric score. `confidence` is declared `optional`, so an
// unset value can be told apart from an explicit 0.
message PairwiseQuestionAnsweringQualityResult {
  // Output only. Pairwise question answering prediction choice.
  PairwiseChoice pairwise_choice = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for the pairwise question answering quality
  // result.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for the pairwise question answering quality
  // result.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for question answering relevance metric: the metric spec together
// with the single instance to evaluate.
message QuestionAnsweringRelevanceInput {
  // Required. Spec for question answering relevance score metric.
  QuestionAnsweringRelevanceSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Question answering relevance instance.
  QuestionAnsweringRelevanceInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Question answering relevance instance: the answer to evaluate and the
// question/instruction it responds to, plus an optional reference and
// context.
//
// All fields are declared `optional`, so their presence is tracked
// explicitly.
message QuestionAnsweringRelevanceInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Text provided as context to answer the question.
  optional string context = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. The question asked and other instruction in the inference prompt.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Spec for question answering relevance metric.
message QuestionAnsweringRelevanceSpec {
  // Optional. Whether to use instance.reference to compute question answering
  // relevance.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  // NOTE(review): the set of valid versions is not stated here - confirm.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for question answering relevance metric.
//
// `score` and `confidence` are declared `optional`, so an unset value can be
// told apart from an explicit 0.
message QuestionAnsweringRelevanceResult {
  // Output only. Question answering relevance score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for question answering relevance score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for question answering relevance score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for question answering helpfulness metric: the metric spec together
// with the single instance to evaluate.
message QuestionAnsweringHelpfulnessInput {
  // Required. Spec for question answering helpfulness score metric.
  QuestionAnsweringHelpfulnessSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Question answering helpfulness instance.
  QuestionAnsweringHelpfulnessInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Question answering helpfulness instance: the answer to evaluate and the
// question/instruction it responds to, plus an optional reference and
// context.
//
// All fields are declared `optional`, so their presence is tracked
// explicitly.
message QuestionAnsweringHelpfulnessInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Text provided as context to answer the question.
  optional string context = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. The question asked and other instruction in the inference prompt.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Configuration for the question answering helpfulness metric.
message QuestionAnsweringHelpfulnessSpec {
  // Optional. Whether to use instance.reference to compute question answering
  // helpfulness. Has implicit presence: leaving it unset is the same as false.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation. Has implicit presence:
  // leaving it unset is the same as 0.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result of the question answering helpfulness metric.
message QuestionAnsweringHelpfulnessResult {
  // Output only. Question Answering Helpfulness score. Declared `optional`, so
  // an unset score can be told apart from a score of 0.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for question answering helpfulness score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for question answering helpfulness score. Declared
  // `optional`, so an unset confidence can be told apart from 0.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for question answering correctness metric: the metric configuration
// together with the single instance to evaluate.
message QuestionAnsweringCorrectnessInput {
  // Required. Spec for question answering correctness score metric.
  QuestionAnsweringCorrectnessSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Question answering correctness instance.
  QuestionAnsweringCorrectnessInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// A single instance to evaluate with the question answering correctness
// metric. Every field is declared `optional` only to get explicit presence
// tracking; fields marked "Required." must still be supplied by the caller.
message QuestionAnsweringCorrectnessInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  // See QuestionAnsweringCorrectnessSpec.use_reference.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Text provided as context to answer the question.
  optional string context = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. The question asked and other instruction in the inference prompt.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Configuration for the question answering correctness metric.
message QuestionAnsweringCorrectnessSpec {
  // Optional. Whether to use instance.reference to compute question answering
  // correctness. Has implicit presence: leaving it unset is the same as false.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation. Has implicit presence:
  // leaving it unset is the same as 0.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result of the question answering correctness metric.
message QuestionAnsweringCorrectnessResult {
  // Output only. Question Answering Correctness score. Declared `optional`, so
  // an unset score can be told apart from a score of 0.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for question answering correctness score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for question answering correctness score. Declared
  // `optional`, so an unset confidence can be told apart from 0.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for tool call valid metric: the metric configuration together with
// the batch of instances to evaluate.
message ToolCallValidInput {
  // Required. Spec for tool call valid metric.
  ToolCallValidSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated tool call valid instances.
  repeated ToolCallValidInstance instances = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Spec for tool call valid metric. Currently declares no configuration
// fields.
message ToolCallValidSpec {}

// A single instance to evaluate with the tool call valid metric. Both fields
// are declared `optional` only to get explicit presence tracking; they are
// still required by the API.
message ToolCallValidInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Results for tool call valid metric.
message ToolCallValidResults {
  // Output only. Tool call valid metric values.
  // NOTE(review): presumably one value per ToolCallValidInput.instances
  // entry, in the same order - confirm against the service implementation.
  repeated ToolCallValidMetricValue tool_call_valid_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Tool call valid metric value for an instance.
message ToolCallValidMetricValue {
  // Output only. Tool call valid score. Declared `optional`, so an unset
  // score can be told apart from a score of 0.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for tool name match metric: the metric configuration together with
// the batch of instances to evaluate.
message ToolNameMatchInput {
  // Required. Spec for tool name match metric.
  ToolNameMatchSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated tool name match instances.
  repeated ToolNameMatchInstance instances = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Spec for tool name match metric. Currently declares no configuration
// fields.
message ToolNameMatchSpec {}

// A single instance to evaluate with the tool name match metric. Both fields
// are declared `optional` only to get explicit presence tracking; they are
// still required by the API.
message ToolNameMatchInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Results for tool name match metric.
message ToolNameMatchResults {
  // Output only. Tool name match metric values.
  // NOTE(review): presumably one value per ToolNameMatchInput.instances
  // entry, in the same order - confirm against the service implementation.
  repeated ToolNameMatchMetricValue tool_name_match_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Tool name match metric value for an instance.
message ToolNameMatchMetricValue {
  // Output only. Tool name match score. Declared `optional`, so an unset
  // score can be told apart from a score of 0.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for tool parameter key match metric: the metric configuration
// together with the batch of instances to evaluate.
message ToolParameterKeyMatchInput {
  // Required. Spec for tool parameter key match metric.
  ToolParameterKeyMatchSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated tool parameter key match instances.
  repeated ToolParameterKeyMatchInstance instances = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Spec for tool parameter key match metric. Currently declares no
// configuration fields.
message ToolParameterKeyMatchSpec {}

// A single instance to evaluate with the tool parameter key match metric.
// Both fields are declared `optional` only to get explicit presence tracking;
// they are still required by the API.
message ToolParameterKeyMatchInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Results for tool parameter key match metric.
message ToolParameterKeyMatchResults {
  // Output only. Tool parameter key match metric values.
  // NOTE(review): presumably one value per
  // ToolParameterKeyMatchInput.instances entry, in the same order - confirm
  // against the service implementation.
  repeated ToolParameterKeyMatchMetricValue
      tool_parameter_key_match_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Tool parameter key match metric value for an instance.
message ToolParameterKeyMatchMetricValue {
  // Output only. Tool parameter key match score. Declared `optional`, so an
  // unset score can be told apart from a score of 0.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for tool parameter key value match metric: the metric configuration
// together with the batch of instances to evaluate.
message ToolParameterKVMatchInput {
  // Required. Spec for tool parameter key value match metric.
  ToolParameterKVMatchSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated tool parameter key value match instances.
  repeated ToolParameterKVMatchInstance instances = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Configuration for the tool parameter key value match metric.
message ToolParameterKVMatchSpec {
  // Optional. Whether to use strict string match on parameter values. Has
  // implicit presence: leaving it unset is the same as false.
  bool use_strict_string_match = 1 [(google.api.field_behavior) = OPTIONAL];
}

// A single instance to evaluate with the tool parameter key value match
// metric. Both fields are declared `optional` only to get explicit presence
// tracking; they are still required by the API.
message ToolParameterKVMatchInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Results for tool parameter key value match metric.
message ToolParameterKVMatchResults {
  // Output only. Tool parameter key value match metric values.
  // NOTE(review): presumably one value per
  // ToolParameterKVMatchInput.instances entry, in the same order - confirm
  // against the service implementation.
  repeated ToolParameterKVMatchMetricValue
      tool_parameter_kv_match_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Tool parameter key value match metric value for an instance.
message ToolParameterKVMatchMetricValue {
  // Output only. Tool parameter key value match score. Declared `optional`,
  // so an unset score can be told apart from a score of 0.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}