xref: /aosp_15_r20/external/googleapis/google/cloud/aiplatform/v1beta1/evaluation_service.proto (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)
1// Copyright 2023 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15syntax = "proto3";
16
17package google.cloud.aiplatform.v1beta1;
18
19import "google/api/annotations.proto";
20import "google/api/client.proto";
21import "google/api/field_behavior.proto";
22import "google/api/resource.proto";
23
24option csharp_namespace = "Google.Cloud.AIPlatform.V1Beta1";
25option go_package = "cloud.google.com/go/aiplatform/apiv1beta1/aiplatformpb;aiplatformpb";
26option java_multiple_files = true;
27option java_outer_classname = "EvaluationServiceProto";
28option java_package = "com.google.cloud.aiplatform.v1beta1";
29option php_namespace = "Google\\Cloud\\AIPlatform\\V1beta1";
30option ruby_package = "Google::Cloud::AIPlatform::V1beta1";
31
// Online evaluation service for Vertex AI.
service EvaluationService {
  option (google.api.default_host) = "aiplatform.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Evaluates the instances in the request against the metric it specifies.
  //
  // Exposed over HTTP as a POST on the `:evaluateInstances` custom verb of the
  // location given in the request; the whole request message is the body.
  rpc EvaluateInstances(EvaluateInstancesRequest)
      returns (EvaluateInstancesResponse) {
    option (google.api.http) = {
      post: "/v1beta1/{location=projects/*/locations/*}:evaluateInstances"
      body: "*"
    };
  }
}
47
// The autorater's preference between a pair of predictions.
enum PairwiseChoice {
  // The prediction choice is unspecified.
  PAIRWISE_CHOICE_UNSPECIFIED = 0;

  // The baseline prediction wins.
  BASELINE = 1;

  // The candidate prediction wins.
  CANDIDATE = 2;

  // The winner cannot be determined.
  TIE = 3;
}
62
// Request message for EvaluationService.EvaluateInstances.
//
// At most one member of `metric_inputs` can be set; it carries the metric spec
// together with the instance(s) to evaluate.
message EvaluateInstancesRequest {
  // Instances and specs for evaluation.
  oneof metric_inputs {
    // Auto metric instances.

    // Instances and metric spec for exact match metric.
    ExactMatchInput exact_match_input = 2;

    // Instances and metric spec for bleu metric.
    BleuInput bleu_input = 3;

    // Instances and metric spec for rouge metric.
    RougeInput rouge_input = 4;

    // LLM-based metric instance.
    // General text generation metrics, applicable to other categories.

    // Input for fluency metric.
    FluencyInput fluency_input = 5;

    // Input for coherence metric.
    CoherenceInput coherence_input = 6;

    // Input for safety metric.
    SafetyInput safety_input = 8;

    // Input for groundedness metric.
    GroundednessInput groundedness_input = 9;

    // Input for fulfillment metric.
    FulfillmentInput fulfillment_input = 12;

    // Summarization only metrics.

    // Input for summarization quality metric.
    SummarizationQualityInput summarization_quality_input = 7;

    // Input for pairwise summarization quality metric.
    PairwiseSummarizationQualityInput pairwise_summarization_quality_input = 23;

    // Input for summarization helpfulness metric.
    SummarizationHelpfulnessInput summarization_helpfulness_input = 14;

    // Input for summarization verbosity metric.
    SummarizationVerbosityInput summarization_verbosity_input = 15;

    // Question answering only metrics.

    // Input for question answering quality metric.
    QuestionAnsweringQualityInput question_answering_quality_input = 10;

    // Input for pairwise question answering quality metric.
    PairwiseQuestionAnsweringQualityInput
        pairwise_question_answering_quality_input = 24;

    // Input for question answering relevance metric.
    QuestionAnsweringRelevanceInput question_answering_relevance_input = 16;

    // Input for question answering helpfulness metric.
    QuestionAnsweringHelpfulnessInput question_answering_helpfulness_input = 17;

    // Input for question answering correctness metric.
    QuestionAnsweringCorrectnessInput question_answering_correctness_input = 18;

    // Tool call metric instances.

    // Input for tool call valid metric.
    ToolCallValidInput tool_call_valid_input = 19;

    // Input for tool name match metric.
    ToolNameMatchInput tool_name_match_input = 20;

    // Input for tool parameter key match metric.
    ToolParameterKeyMatchInput tool_parameter_key_match_input = 21;

    // Input for tool parameter key value match metric.
    ToolParameterKVMatchInput tool_parameter_kv_match_input = 22;
  }

  // Required. The resource name of the Location to evaluate the instances.
  // Format: `projects/{project}/locations/{location}`
  string location = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "locations.googleapis.com/Location"
    }
  ];
}
147
// Response message for EvaluationService.EvaluateInstances.
//
// At most one member of `evaluation_results` is set.
message EvaluateInstancesResponse {
  // Evaluation results are served in the same order as the instances
  // presented in the EvaluateInstancesRequest.
  oneof evaluation_results {
    // Auto metric evaluation results.

    // Results for exact match metric.
    ExactMatchResults exact_match_results = 1;

    // Results for bleu metric.
    BleuResults bleu_results = 2;

    // Results for rouge metric.
    RougeResults rouge_results = 3;

    // LLM-based metric evaluation result.
    // General text generation metrics, applicable to other categories.

    // Result for fluency metric.
    FluencyResult fluency_result = 4;

    // Result for coherence metric.
    CoherenceResult coherence_result = 5;

    // Result for safety metric.
    SafetyResult safety_result = 7;

    // Result for groundedness metric.
    GroundednessResult groundedness_result = 8;

    // Result for fulfillment metric.
    FulfillmentResult fulfillment_result = 11;

    // Summarization only metrics.

    // Result for summarization quality metric.
    SummarizationQualityResult summarization_quality_result = 6;

    // Result for pairwise summarization quality metric.
    PairwiseSummarizationQualityResult pairwise_summarization_quality_result =
        22;

    // Result for summarization helpfulness metric.
    SummarizationHelpfulnessResult summarization_helpfulness_result = 13;

    // Result for summarization verbosity metric.
    SummarizationVerbosityResult summarization_verbosity_result = 14;

    // Question answering only metrics.

    // Result for question answering quality metric.
    QuestionAnsweringQualityResult question_answering_quality_result = 9;

    // Result for pairwise question answering quality metric.
    PairwiseQuestionAnsweringQualityResult
        pairwise_question_answering_quality_result = 23;

    // Result for question answering relevance metric.
    QuestionAnsweringRelevanceResult question_answering_relevance_result = 15;

    // Result for question answering helpfulness metric.
    QuestionAnsweringHelpfulnessResult question_answering_helpfulness_result =
        16;

    // Result for question answering correctness metric.
    QuestionAnsweringCorrectnessResult question_answering_correctness_result =
        17;

    // Tool call metrics.

    // Results for tool call valid metric.
    ToolCallValidResults tool_call_valid_results = 18;

    // Results for tool name match metric.
    ToolNameMatchResults tool_name_match_results = 19;

    // Results for tool parameter key match metric.
    ToolParameterKeyMatchResults tool_parameter_key_match_results = 20;

    // Results for tool parameter key value match metric.
    ToolParameterKVMatchResults tool_parameter_kv_match_results = 21;
  }
}
227
// Input for the exact match metric: the metric spec plus the instances to
// evaluate.
message ExactMatchInput {
  // Required. The spec for the exact match metric.
  ExactMatchSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The exact match instances to evaluate.
  repeated ExactMatchInstance instances = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
237
// A single instance to be evaluated by the exact match metric.
message ExactMatchInstance {
  // Required. The evaluated model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The ground truth to compare the prediction against.
  optional string reference = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
246
// Spec for the exact match metric, which returns 1 if the prediction and the
// reference match exactly and 0 otherwise.
message ExactMatchSpec {}
250
// The results produced by the exact match metric.
message ExactMatchResults {
  // Output only. The exact match metric values.
  repeated ExactMatchMetricValue exact_match_metric_values = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
257
// The exact match metric value computed for one instance.
message ExactMatchMetricValue {
  // Output only. The exact match score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
263
// Input for the bleu metric: the metric spec plus the instances to evaluate.
message BleuInput {
  // Required. The spec for the bleu score metric.
  BleuSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The bleu instances to evaluate.
  repeated BleuInstance instances = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
272
// A single instance to be evaluated by the bleu metric.
message BleuInstance {
  // Required. The evaluated model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The ground truth to compare the prediction against.
  optional string reference = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
281
// Spec for the bleu score metric, which calculates the precision of n-grams
// in the prediction as compared to the reference and returns a score between
// 0 and 1.
message BleuSpec {}
285
// The results produced by the bleu metric.
message BleuResults {
  // Output only. The bleu metric values.
  repeated BleuMetricValue bleu_metric_values = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
292
// The bleu metric value computed for one instance.
message BleuMetricValue {
  // Output only. The bleu score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
298
// Input for the rouge metric: the metric spec plus the instances to evaluate.
message RougeInput {
  // Required. The spec for the rouge score metric.
  RougeSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The rouge instances to evaluate.
  repeated RougeInstance instances = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
307
// A single instance to be evaluated by the rouge metric.
message RougeInstance {
  // Required. The evaluated model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The ground truth to compare the prediction against.
  optional string reference = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
316
// Spec for the rouge score metric, which calculates the recall of n-grams in
// the prediction as compared to the reference and returns a score between 0
// and 1.
message RougeSpec {
  // Optional. The rouge type to compute. Supported rouge types are
  // rougen[1-9], rougeL and rougeLsum.
  string rouge_type = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. Whether a stemmer is used when computing the rouge score.
  bool use_stemmer = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. Whether summaries are split when using rougeLsum.
  bool split_summaries = 3 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
329
// The results produced by the rouge metric.
message RougeResults {
  // Output only. The rouge metric values.
  repeated RougeMetricValue rouge_metric_values = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
336
// The rouge metric value computed for one instance.
message RougeMetricValue {
  // Output only. The rouge score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
342
// Input for the coherence metric: the metric spec plus the instance to
// evaluate.
message CoherenceInput {
  // Required. The spec for the coherence score metric.
  CoherenceSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The coherence instance to evaluate.
  CoherenceInstance instance = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
351
// A single instance to be evaluated by the coherence metric.
message CoherenceInstance {
  // Required. The evaluated model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];
}
357
// Spec for the coherence score metric.
message CoherenceSpec {
  // Optional. The version to use for evaluation.
  int32 version = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
363
// The result of evaluating an instance with the coherence metric.
message CoherenceResult {
  // Output only. The coherence score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. An explanation of the coherence score.
  string explanation = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The confidence for the coherence score.
  optional float confidence = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
375
// Input for the fluency metric: the metric spec plus the instance to evaluate.
message FluencyInput {
  // Required. The spec for the fluency score metric.
  FluencySpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The fluency instance to evaluate.
  FluencyInstance instance = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
384
// A single instance to be evaluated by the fluency metric.
message FluencyInstance {
  // Required. The evaluated model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];
}
390
// Spec for the fluency score metric.
message FluencySpec {
  // Optional. The version to use for evaluation.
  int32 version = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
396
// The result of evaluating an instance with the fluency metric.
message FluencyResult {
  // Output only. The fluency score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. An explanation of the fluency score.
  string explanation = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The confidence for the fluency score.
  optional float confidence = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
408
// Input for the safety metric: the metric spec plus the instance to evaluate.
message SafetyInput {
  // Required. The spec for the safety metric.
  SafetySpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The safety instance to evaluate.
  SafetyInstance instance = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
417
// A single instance to be evaluated by the safety metric.
message SafetyInstance {
  // Required. The evaluated model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];
}
423
// Spec for the safety metric.
message SafetySpec {
  // Optional. The version to use for evaluation.
  int32 version = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
429
// The result of evaluating an instance with the safety metric.
message SafetyResult {
  // Output only. The safety score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. An explanation of the safety score.
  string explanation = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The confidence for the safety score.
  optional float confidence = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
441
// Input for the groundedness metric: the metric spec plus the instance to
// evaluate.
message GroundednessInput {
  // Required. The spec for the groundedness metric.
  GroundednessSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The groundedness instance to evaluate.
  GroundednessInstance instance = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
450
// A single instance to be evaluated by the groundedness metric.
message GroundednessInstance {
  // Required. The evaluated model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The background information provided in context, which the
  // prediction is compared against.
  optional string context = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
460
// Spec for the groundedness metric.
message GroundednessSpec {
  // Optional. The version to use for evaluation.
  int32 version = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
466
// The result of evaluating an instance with the groundedness metric.
message GroundednessResult {
  // Output only. The groundedness score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. An explanation of the groundedness score.
  string explanation = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The confidence for the groundedness score.
  optional float confidence = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
478
// Input for the fulfillment metric: the metric spec plus the instance to
// evaluate.
message FulfillmentInput {
  // Required. The spec for the fulfillment score metric.
  FulfillmentSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The fulfillment instance to evaluate.
  FulfillmentInstance instance = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
487
// A single instance to be evaluated by the fulfillment metric.
message FulfillmentInstance {
  // Required. The evaluated model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The inference instruction prompt to compare the prediction with.
  optional string instruction = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
496
// Spec for the fulfillment metric.
message FulfillmentSpec {
  // Optional. The version to use for evaluation.
  int32 version = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
502
// The result of evaluating an instance with the fulfillment metric.
message FulfillmentResult {
  // Output only. The fulfillment score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. An explanation of the fulfillment score.
  string explanation = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The confidence for the fulfillment score.
  optional float confidence = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
514
// Input for the summarization quality metric: the metric spec plus the
// instance to evaluate.
message SummarizationQualityInput {
  // Required. The spec for the summarization quality score metric.
  SummarizationQualitySpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The summarization quality instance to evaluate.
  SummarizationQualityInstance instance = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
525
// A single instance to be evaluated by the summarization quality metric.
message SummarizationQualityInstance {
  // Required. The evaluated model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Optional. The ground truth to compare the prediction against.
  optional string reference = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Required. The text to be summarized.
  optional string context = 3 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The summarization prompt for the LLM.
  optional string instruction = 4 [
    (google.api.field_behavior) = REQUIRED
  ];
}
540
// Spec for the summarization quality score metric.
message SummarizationQualitySpec {
  // Optional. Whether instance.reference is used to compute summarization
  // quality.
  bool use_reference = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. The version to use for evaluation.
  int32 version = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
550
// The result of evaluating an instance with the summarization quality metric.
message SummarizationQualityResult {
  // Output only. The summarization quality score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. An explanation of the summarization quality score.
  string explanation = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The confidence for the summarization quality score.
  optional float confidence = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
562
// Input for the pairwise summarization quality metric: the metric spec plus
// the instance to evaluate.
message PairwiseSummarizationQualityInput {
  // Required. The spec for the pairwise summarization quality score metric.
  PairwiseSummarizationQualitySpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The pairwise summarization quality instance to evaluate.
  PairwiseSummarizationQualityInstance instance = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
573
// A single instance to be evaluated by the pairwise summarization quality
// metric. It carries both a candidate and a baseline prediction.
message PairwiseSummarizationQualityInstance {
  // Required. The candidate model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The baseline model's output.
  optional string baseline_prediction = 2 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Optional. The ground truth to compare the prediction against.
  optional string reference = 3 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Required. The text to be summarized.
  optional string context = 4 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The summarization prompt for the LLM.
  optional string instruction = 5 [
    (google.api.field_behavior) = REQUIRED
  ];
}
592
// Spec for the pairwise summarization quality score metric.
message PairwiseSummarizationQualitySpec {
  // Optional. Whether instance.reference is used to compute pairwise
  // summarization quality.
  bool use_reference = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. The version to use for evaluation.
  int32 version = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
602
// The result of evaluating an instance with the pairwise summarization
// quality metric.
message PairwiseSummarizationQualityResult {
  // Output only. The pairwise summarization prediction choice.
  PairwiseChoice pairwise_choice = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. An explanation of the summarization quality score.
  string explanation = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The confidence for the summarization quality score.
  optional float confidence = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
615
// Input for the summarization helpfulness metric: the metric spec plus the
// instance to evaluate.
message SummarizationHelpfulnessInput {
  // Required. The spec for the summarization helpfulness score metric.
  SummarizationHelpfulnessSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The summarization helpfulness instance to evaluate.
  SummarizationHelpfulnessInstance instance = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
626
// A single instance to be evaluated by the summarization helpfulness metric.
message SummarizationHelpfulnessInstance {
  // Required. The evaluated model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Optional. The ground truth to compare the prediction against.
  optional string reference = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Required. The text to be summarized.
  optional string context = 3 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Optional. The summarization prompt for the LLM.
  optional string instruction = 4 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
641
// Spec for the summarization helpfulness score metric.
message SummarizationHelpfulnessSpec {
  // Optional. Whether instance.reference is used to compute summarization
  // helpfulness.
  bool use_reference = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. The version to use for evaluation.
  int32 version = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
651
// The result of evaluating an instance with the summarization helpfulness
// metric.
message SummarizationHelpfulnessResult {
  // Output only. The summarization helpfulness score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. An explanation of the summarization helpfulness score.
  string explanation = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The confidence for the summarization helpfulness score.
  optional float confidence = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
663
// Input for the summarization verbosity metric: the metric spec plus the
// instance to evaluate.
message SummarizationVerbosityInput {
  // Required. The spec for the summarization verbosity score metric.
  SummarizationVerbositySpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The summarization verbosity instance to evaluate.
  SummarizationVerbosityInstance instance = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
674
// A single instance to be evaluated by the summarization verbosity metric.
message SummarizationVerbosityInstance {
  // Required. The evaluated model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Optional. The ground truth to compare the prediction against.
  optional string reference = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Required. The text to be summarized.
  optional string context = 3 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Optional. The summarization prompt for the LLM.
  optional string instruction = 4 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
689
// Spec for the summarization verbosity score metric.
message SummarizationVerbositySpec {
  // Optional. Whether instance.reference is used to compute summarization
  // verbosity.
  bool use_reference = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. The version to use for evaluation.
  int32 version = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
699
// The result of evaluating an instance with the summarization verbosity
// metric.
message SummarizationVerbosityResult {
  // Output only. The summarization verbosity score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. An explanation of the summarization verbosity score.
  string explanation = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The confidence for the summarization verbosity score.
  optional float confidence = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
711
// Input for the question answering quality metric: the metric spec plus the
// instance to evaluate.
message QuestionAnsweringQualityInput {
  // Required. The spec for the question answering quality score metric.
  QuestionAnsweringQualitySpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The question answering quality instance to evaluate.
  QuestionAnsweringQualityInstance instance = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
722
// A single instance to be evaluated by the question answering quality metric.
message QuestionAnsweringQualityInstance {
  // Required. The evaluated model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Optional. The ground truth to compare the prediction against.
  optional string reference = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Required. The text to answer the question.
  optional string context = 3 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The question answering prompt for the LLM.
  optional string instruction = 4 [
    (google.api.field_behavior) = REQUIRED
  ];
}
737
// Spec for the question answering quality score metric.
message QuestionAnsweringQualitySpec {
  // Optional. Whether instance.reference is used to compute question
  // answering quality.
  bool use_reference = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. The version to use for evaluation.
  int32 version = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
747
// The result of evaluating an instance with the question answering quality
// metric.
message QuestionAnsweringQualityResult {
  // Output only. The question answering quality score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. An explanation of the question answering quality score.
  string explanation = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The confidence for the question answering quality score.
  optional float confidence = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
759
// Input for the pairwise question answering quality metric: the metric spec
// plus the instance to evaluate.
message PairwiseQuestionAnsweringQualityInput {
  // Required. The spec for the pairwise question answering quality score
  // metric.
  PairwiseQuestionAnsweringQualitySpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The pairwise question answering quality instance to evaluate.
  PairwiseQuestionAnsweringQualityInstance instance = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
770
// A single instance to be evaluated by the pairwise question answering
// quality metric. It carries both a candidate and a baseline prediction.
message PairwiseQuestionAnsweringQualityInstance {
  // Required. The candidate model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The baseline model's output.
  optional string baseline_prediction = 2 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Optional. The ground truth to compare the prediction against.
  optional string reference = 3 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Required. The text to answer the question.
  optional string context = 4 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The question answering prompt for the LLM.
  optional string instruction = 5 [
    (google.api.field_behavior) = REQUIRED
  ];
}
789
// Spec for the pairwise question answering quality score metric.
message PairwiseQuestionAnsweringQualitySpec {
  // Optional. Whether instance.reference is used to compute question
  // answering quality.
  bool use_reference = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. The version to use for evaluation.
  int32 version = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
799
// The result of evaluating an instance with the pairwise question answering
// quality metric.
message PairwiseQuestionAnsweringQualityResult {
  // Output only. The pairwise question answering prediction choice.
  PairwiseChoice pairwise_choice = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. An explanation of the question answering quality score.
  string explanation = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. The confidence for the question answering quality score.
  optional float confidence = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
812
// Input for the question answering relevance metric: the metric spec plus the
// instance to evaluate.
message QuestionAnsweringRelevanceInput {
  // Required. The spec for the question answering relevance score metric.
  QuestionAnsweringRelevanceSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The question answering relevance instance to evaluate.
  QuestionAnsweringRelevanceInstance instance = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
823
// A single instance to be evaluated by the question answering relevance
// metric.
message QuestionAnsweringRelevanceInstance {
  // Required. The evaluated model's output.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Optional. The ground truth to compare the prediction against.
  optional string reference = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. The text provided as context to answer the question.
  optional string context = 3 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Required. The question asked and other instruction in the inference
  // prompt.
  optional string instruction = 4 [
    (google.api.field_behavior) = REQUIRED
  ];
}
838
// Configuration of the question answering relevance metric.
message QuestionAnsweringRelevanceSpec {
  // Optional. Set to true to have instance.reference taken into account when
  // question answering relevance is computed.
  bool use_reference = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. Selects which version is used for evaluation.
  int32 version = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
848
// Result of the question answering relevance metric.
message QuestionAnsweringRelevanceResult {
  // Output only. The question answering relevance score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. Explanation accompanying the question answering relevance
  // score.
  string explanation = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. Confidence reported for the question answering relevance
  // score.
  optional float confidence = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
860
// Input to the question answering helpfulness metric: a metric spec plus one
// instance.
message QuestionAnsweringHelpfulnessInput {
  // Required. The spec of the question answering helpfulness score metric.
  QuestionAnsweringHelpfulnessSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The question answering helpfulness instance.
  QuestionAnsweringHelpfulnessInstance instance = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
871
// One instance for the question answering helpfulness metric.
message QuestionAnsweringHelpfulnessInstance {
  // Required. The output produced by the model under evaluation.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Optional. The ground truth that the prediction is compared against.
  optional string reference = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. Context text supplied for answering the question.
  optional string context = 3 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Required. The question asked, together with any other instruction, in
  // the inference prompt.
  optional string instruction = 4 [
    (google.api.field_behavior) = REQUIRED
  ];
}
886
// Configuration of the question answering helpfulness metric.
message QuestionAnsweringHelpfulnessSpec {
  // Optional. Set to true to have instance.reference taken into account when
  // question answering helpfulness is computed.
  bool use_reference = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. Selects which version is used for evaluation.
  int32 version = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
896
// Result of the question answering helpfulness metric.
message QuestionAnsweringHelpfulnessResult {
  // Output only. The question answering helpfulness score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. Explanation accompanying the question answering helpfulness
  // score.
  string explanation = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. Confidence reported for the question answering helpfulness
  // score.
  optional float confidence = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
908
// Input to the question answering correctness metric: a metric spec plus one
// instance.
message QuestionAnsweringCorrectnessInput {
  // Required. The spec of the question answering correctness score metric.
  QuestionAnsweringCorrectnessSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The question answering correctness instance.
  QuestionAnsweringCorrectnessInstance instance = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
919
// One instance for the question answering correctness metric.
message QuestionAnsweringCorrectnessInstance {
  // Required. The output produced by the model under evaluation.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Optional. The ground truth that the prediction is compared against.
  optional string reference = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. Context text supplied for answering the question.
  optional string context = 3 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Required. The question asked, together with any other instruction, in
  // the inference prompt.
  optional string instruction = 4 [
    (google.api.field_behavior) = REQUIRED
  ];
}
934
// Configuration of the question answering correctness metric.
message QuestionAnsweringCorrectnessSpec {
  // Optional. Set to true to have instance.reference taken into account when
  // question answering correctness is computed.
  bool use_reference = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. Selects which version is used for evaluation.
  int32 version = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
944
// Result of the question answering correctness metric.
message QuestionAnsweringCorrectnessResult {
  // Output only. The question answering correctness score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. Explanation accompanying the question answering correctness
  // score.
  string explanation = 2 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // Output only. Confidence reported for the question answering correctness
  // score.
  optional float confidence = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
956
// Input to the tool call valid metric: a metric spec plus a list of
// instances.
message ToolCallValidInput {
  // Required. The spec of the tool call valid metric.
  ToolCallValidSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The tool call valid instances; may be repeated.
  repeated ToolCallValidInstance instances = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
966
// Configuration of the tool call valid metric. Declares no fields.
message ToolCallValidSpec {
}
969
// One instance for the tool call valid metric.
message ToolCallValidInstance {
  // Required. The output produced by the model under evaluation.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The ground truth that the prediction is compared against.
  optional string reference = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
978
// Results of the tool call valid metric.
message ToolCallValidResults {
  // Output only. The tool call valid metric values.
  repeated ToolCallValidMetricValue tool_call_valid_metric_values = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
985
// The tool call valid metric value for one instance.
message ToolCallValidMetricValue {
  // Output only. The tool call valid score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
991
// Input to the tool name match metric: a metric spec plus a list of
// instances.
message ToolNameMatchInput {
  // Required. The spec of the tool name match metric.
  ToolNameMatchSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The tool name match instances; may be repeated.
  repeated ToolNameMatchInstance instances = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
1001
// Configuration of the tool name match metric. Declares no fields.
message ToolNameMatchSpec {
}
1004
// One instance for the tool name match metric.
message ToolNameMatchInstance {
  // Required. The output produced by the model under evaluation.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The ground truth that the prediction is compared against.
  optional string reference = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
1013
// Results of the tool name match metric.
message ToolNameMatchResults {
  // Output only. The tool name match metric values.
  repeated ToolNameMatchMetricValue tool_name_match_metric_values = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
1020
// The tool name match metric value for one instance.
message ToolNameMatchMetricValue {
  // Output only. The tool name match score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
1026
// Input to the tool parameter key match metric: a metric spec plus a list of
// instances.
message ToolParameterKeyMatchInput {
  // Required. The spec of the tool parameter key match metric.
  ToolParameterKeyMatchSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The tool parameter key match instances; may be repeated.
  repeated ToolParameterKeyMatchInstance instances = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
1037
// Configuration of the tool parameter key match metric. Declares no fields.
message ToolParameterKeyMatchSpec {
}
1040
// One instance for the tool parameter key match metric.
message ToolParameterKeyMatchInstance {
  // Required. The output produced by the model under evaluation.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The ground truth that the prediction is compared against.
  optional string reference = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
1049
// Results of the tool parameter key match metric.
message ToolParameterKeyMatchResults {
  // Output only. The tool parameter key match metric values.
  repeated ToolParameterKeyMatchMetricValue
      tool_parameter_key_match_metric_values = 1 [
        (google.api.field_behavior) = OUTPUT_ONLY
      ];
}
1057
// The tool parameter key match metric value for one instance.
message ToolParameterKeyMatchMetricValue {
  // Output only. The tool parameter key match score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
1063
// Input to the tool parameter key value match metric: a metric spec plus a
// list of instances.
message ToolParameterKVMatchInput {
  // Required. The spec of the tool parameter key value match metric.
  ToolParameterKVMatchSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The tool parameter key value match instances; may be repeated.
  repeated ToolParameterKVMatchInstance instances = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
1074
// Configuration of the tool parameter key value match metric.
message ToolParameterKVMatchSpec {
  // Optional. Whether parameter values are compared using strict string
  // match.
  bool use_strict_string_match = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
1080
// One instance for the tool parameter key value match metric.
message ToolParameterKVMatchInstance {
  // Required. The output produced by the model under evaluation.
  optional string prediction = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. The ground truth that the prediction is compared against.
  optional string reference = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
1089
// Results of the tool parameter key value match metric.
message ToolParameterKVMatchResults {
  // Output only. The tool parameter key value match metric values.
  repeated ToolParameterKVMatchMetricValue
      tool_parameter_kv_match_metric_values = 1 [
        (google.api.field_behavior) = OUTPUT_ONLY
      ];
}
1097
// The tool parameter key value match metric value for one instance.
message ToolParameterKVMatchMetricValue {
  // Output only. The tool parameter key value match score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
1103