// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1Beta1";
option go_package = "cloud.google.com/go/aiplatform/apiv1beta1/aiplatformpb;aiplatformpb";
option java_multiple_files = true;
option java_outer_classname = "EvaluationServiceProto";
option java_package = "com.google.cloud.aiplatform.v1beta1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1beta1";
option ruby_package = "Google::Cloud::AIPlatform::V1beta1";

// Vertex AI Online Evaluation Service.
service EvaluationService {
  option (google.api.default_host) = "aiplatform.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Evaluates instances based on a given metric.
  rpc EvaluateInstances(EvaluateInstancesRequest)
      returns (EvaluateInstancesResponse) {
    option (google.api.http) = {
      post: "/v1beta1/{location=projects/*/locations/*}:evaluateInstances"
      body: "*"
    };
  }
}

// Pairwise prediction autorater preference.
enum PairwiseChoice {
  // Unspecified prediction choice.
  PAIRWISE_CHOICE_UNSPECIFIED = 0;

  // Baseline prediction wins.
  BASELINE = 1;

  // Candidate prediction wins.
  CANDIDATE = 2;

  // Winner cannot be determined.
  TIE = 3;
}

// Request message for EvaluationService.EvaluateInstances.
message EvaluateInstancesRequest {
  // Instances and specs for evaluation. Exactly one metric input may be set
  // per request.
  oneof metric_inputs {
    // Auto metric instances.
    // Instances and metric spec for exact match metric.
    ExactMatchInput exact_match_input = 2;

    // Instances and metric spec for bleu metric.
    BleuInput bleu_input = 3;

    // Instances and metric spec for rouge metric.
    RougeInput rouge_input = 4;

    // LLM-based metric instance.
    // General text generation metrics, applicable to other categories.
    // Input for fluency metric.
    FluencyInput fluency_input = 5;

    // Input for coherence metric.
    CoherenceInput coherence_input = 6;

    // Input for safety metric.
    SafetyInput safety_input = 8;

    // Input for groundedness metric.
    GroundednessInput groundedness_input = 9;

    // Input for fulfillment metric.
    FulfillmentInput fulfillment_input = 12;

    // Summarization only metrics.
    // Input for summarization quality metric.
    SummarizationQualityInput summarization_quality_input = 7;

    // Input for pairwise summarization quality metric.
    PairwiseSummarizationQualityInput pairwise_summarization_quality_input = 23;

    // Input for summarization helpfulness metric.
    SummarizationHelpfulnessInput summarization_helpfulness_input = 14;

    // Input for summarization verbosity metric.
    SummarizationVerbosityInput summarization_verbosity_input = 15;

    // Question answering only metrics.
    // Input for question answering quality metric.
    QuestionAnsweringQualityInput question_answering_quality_input = 10;

    // Input for pairwise question answering quality metric.
    PairwiseQuestionAnsweringQualityInput
        pairwise_question_answering_quality_input = 24;

    // Input for question answering relevance metric.
    QuestionAnsweringRelevanceInput question_answering_relevance_input = 16;

    // Input for question answering helpfulness
    // metric.
    QuestionAnsweringHelpfulnessInput question_answering_helpfulness_input = 17;

    // Input for question answering correctness
    // metric.
    QuestionAnsweringCorrectnessInput question_answering_correctness_input = 18;

    // Tool call metric instances.
    // Input for tool call valid metric.
    ToolCallValidInput tool_call_valid_input = 19;

    // Input for tool name match metric.
    ToolNameMatchInput tool_name_match_input = 20;

    // Input for tool parameter key match metric.
    ToolParameterKeyMatchInput tool_parameter_key_match_input = 21;

    // Input for tool parameter key value match metric.
    ToolParameterKVMatchInput tool_parameter_kv_match_input = 22;
  }

  // Required. The resource name of the Location to evaluate the instances.
  // Format: `projects/{project}/locations/{location}`
  string location = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "locations.googleapis.com/Location"
    }
  ];
}

// Response message for EvaluationService.EvaluateInstances.
message EvaluateInstancesResponse {
  // Evaluation results will be served in the same order as the instances
  // presented in the metric input of EvaluateInstancesRequest.
  oneof evaluation_results {
    // Auto metric evaluation results.
    // Results for exact match metric.
    ExactMatchResults exact_match_results = 1;

    // Results for bleu metric.
    BleuResults bleu_results = 2;

    // Results for rouge metric.
    RougeResults rouge_results = 3;

    // LLM-based metric evaluation result.
    // General text generation metrics, applicable to other categories.
    // Result for fluency metric.
    FluencyResult fluency_result = 4;

    // Result for coherence metric.
    CoherenceResult coherence_result = 5;

    // Result for safety metric.
    SafetyResult safety_result = 7;

    // Result for groundedness metric.
    GroundednessResult groundedness_result = 8;

    // Result for fulfillment metric.
    FulfillmentResult fulfillment_result = 11;

    // Summarization only metrics.
    // Result for summarization quality metric.
    SummarizationQualityResult summarization_quality_result = 6;

    // Result for pairwise summarization quality metric.
    PairwiseSummarizationQualityResult pairwise_summarization_quality_result =
        22;

    // Result for summarization helpfulness metric.
    SummarizationHelpfulnessResult summarization_helpfulness_result = 13;

    // Result for summarization verbosity metric.
    SummarizationVerbosityResult summarization_verbosity_result = 14;

    // Question answering only metrics.
    // Result for question answering quality metric.
    QuestionAnsweringQualityResult question_answering_quality_result = 9;

    // Result for pairwise question answering quality metric.
    PairwiseQuestionAnsweringQualityResult
        pairwise_question_answering_quality_result = 23;

    // Result for question answering relevance metric.
    QuestionAnsweringRelevanceResult question_answering_relevance_result = 15;

    // Result for question answering helpfulness metric.
    QuestionAnsweringHelpfulnessResult question_answering_helpfulness_result =
        16;

    // Result for question answering correctness metric.
    QuestionAnsweringCorrectnessResult question_answering_correctness_result =
        17;

    // Tool call metrics.
    // Results for tool call valid metric.
    ToolCallValidResults tool_call_valid_results = 18;

    // Results for tool name match metric.
    ToolNameMatchResults tool_name_match_results = 19;

    // Results for tool parameter key match metric.
    ToolParameterKeyMatchResults tool_parameter_key_match_results = 20;

    // Results for tool parameter key value match metric.
    ToolParameterKVMatchResults tool_parameter_kv_match_results = 21;
  }
}

// Input for exact match metric.
message ExactMatchInput {
  // Required. Spec for exact match metric.
  ExactMatchSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated exact match instances.
  repeated ExactMatchInstance instances = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for exact match metric.
message ExactMatchInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for exact match metric - returns 1 if prediction and reference exactly
// match, otherwise 0.
message ExactMatchSpec {}

// Results for exact match metric.
message ExactMatchResults {
  // Output only. Exact match metric values.
  repeated ExactMatchMetricValue exact_match_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Exact match metric value for an instance.
message ExactMatchMetricValue {
  // Output only. Exact match score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for bleu metric.
message BleuInput {
  // Required. Spec for bleu score metric.
  BleuSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated bleu instances.
  repeated BleuInstance instances = 2 [(google.api.field_behavior) = REQUIRED];
}

// Instance for bleu metric.
message BleuInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for bleu score metric - calculates the precision of n-grams in the
// prediction as compared to reference - returns a score ranging between 0 and
// 1.
message BleuSpec {}

// Results for bleu metric.
message BleuResults {
  // Output only. Bleu metric values.
  repeated BleuMetricValue bleu_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Bleu metric value for an instance.
message BleuMetricValue {
  // Output only. Bleu score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for rouge metric.
message RougeInput {
  // Required. Spec for rouge score metric.
  RougeSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated rouge instances.
  repeated RougeInstance instances = 2 [(google.api.field_behavior) = REQUIRED];
}

// Instance for rouge metric.
message RougeInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for rouge score metric - calculates the recall of n-grams in prediction
// as compared to reference - returns a score ranging between 0 and 1.
message RougeSpec {
  // Optional. Supported rouge types are rougen[1-9], rougeL and rougeLsum.
  string rouge_type = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Whether to use stemmer to compute rouge score.
  bool use_stemmer = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Whether to split summaries while using rougeLsum.
  bool split_summaries = 3 [(google.api.field_behavior) = OPTIONAL];
}

// Results for rouge metric.
message RougeResults {
  // Output only. Rouge metric values.
  repeated RougeMetricValue rouge_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Rouge metric value for an instance.
message RougeMetricValue {
  // Output only. Rouge score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for coherence metric.
message CoherenceInput {
  // Required. Spec for coherence score metric.
  CoherenceSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Coherence instance.
  CoherenceInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Instance for coherence metric.
message CoherenceInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];
}

// Spec for coherence score metric.
message CoherenceSpec {
  // Optional. Which version to use for evaluation.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for coherence metric.
message CoherenceResult {
  // Output only. Coherence score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for coherence score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for coherence score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for fluency metric.
message FluencyInput {
  // Required. Spec for fluency score metric.
  FluencySpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Fluency instance.
  FluencyInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Instance for fluency metric.
message FluencyInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];
}

// Spec for fluency score metric.
message FluencySpec {
  // Optional. Which version to use for evaluation.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for fluency metric.
message FluencyResult {
  // Output only. Fluency score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for fluency score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for fluency score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for safety metric.
message SafetyInput {
  // Required. Spec for safety metric.
  SafetySpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Safety instance.
  SafetyInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Instance for safety metric.
message SafetyInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];
}

// Spec for safety metric.
message SafetySpec {
  // Optional. Which version to use for evaluation.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for safety metric.
message SafetyResult {
  // Output only. Safety score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for safety score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for safety score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for groundedness metric.
message GroundednessInput {
  // Required. Spec for groundedness metric.
  GroundednessSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Groundedness instance.
  GroundednessInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Instance for groundedness metric.
message GroundednessInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Background information provided in context used to compare
  // against the prediction.
  optional string context = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for groundedness metric.
message GroundednessSpec {
  // Optional. Which version to use for evaluation.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for groundedness metric.
message GroundednessResult {
  // Output only. Groundedness score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for groundedness score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for groundedness score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for fulfillment metric.
message FulfillmentInput {
  // Required. Spec for fulfillment score metric.
  FulfillmentSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Fulfillment instance.
  FulfillmentInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Instance for fulfillment metric.
message FulfillmentInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Inference instruction prompt to compare prediction with.
  optional string instruction = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for fulfillment metric.
message FulfillmentSpec {
  // Optional. Which version to use for evaluation.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for fulfillment metric.
message FulfillmentResult {
  // Output only. Fulfillment score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for fulfillment score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for fulfillment score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for summarization quality metric.
message SummarizationQualityInput {
  // Required. Spec for summarization quality score metric.
  SummarizationQualitySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization quality instance.
  SummarizationQualityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for summarization quality metric.
message SummarizationQualityInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to be summarized.
  optional string context = 3 [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization prompt for LLM.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Spec for summarization quality score metric.
message SummarizationQualitySpec {
  // Optional. Whether to use instance.reference to compute summarization
  // quality.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for summarization quality metric.
message SummarizationQualityResult {
  // Output only. Summarization Quality score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for summarization quality score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for summarization quality score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for pairwise summarization quality metric.
message PairwiseSummarizationQualityInput {
  // Required. Spec for pairwise summarization quality score metric.
  PairwiseSummarizationQualitySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Pairwise summarization quality instance.
  PairwiseSummarizationQualityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for pairwise summarization quality metric.
message PairwiseSummarizationQualityInstance {
  // Required. Output of the candidate model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Output of the baseline model.
  optional string baseline_prediction = 2
      [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to be summarized.
  optional string context = 4 [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization prompt for LLM.
  optional string instruction = 5 [(google.api.field_behavior) = REQUIRED];
}

// Spec for pairwise summarization quality score metric.
message PairwiseSummarizationQualitySpec {
  // Optional. Whether to use instance.reference to compute pairwise
  // summarization quality.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for pairwise summarization quality metric.
message PairwiseSummarizationQualityResult {
  // Output only. Pairwise summarization prediction choice.
  PairwiseChoice pairwise_choice = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for the pairwise summarization quality choice.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for the pairwise summarization quality choice.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for summarization helpfulness metric.
message SummarizationHelpfulnessInput {
  // Required. Spec for summarization helpfulness score metric.
  SummarizationHelpfulnessSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization helpfulness instance.
  SummarizationHelpfulnessInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for summarization helpfulness metric.
message SummarizationHelpfulnessInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to be summarized.
  optional string context = 3 [(google.api.field_behavior) = REQUIRED];

  // Optional. Summarization prompt for LLM.
  optional string instruction = 4 [(google.api.field_behavior) = OPTIONAL];
}

// Spec for summarization helpfulness score metric.
message SummarizationHelpfulnessSpec {
  // Optional. Whether to use instance.reference to compute summarization
  // helpfulness.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for summarization helpfulness metric.
message SummarizationHelpfulnessResult {
  // Output only. Summarization Helpfulness score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for summarization helpfulness score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for summarization helpfulness score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for summarization verbosity metric.
message SummarizationVerbosityInput {
  // Required. Spec for summarization verbosity score metric.
  SummarizationVerbositySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization verbosity instance.
  SummarizationVerbosityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for summarization verbosity metric.
message SummarizationVerbosityInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to be summarized.
  optional string context = 3 [(google.api.field_behavior) = REQUIRED];

  // Optional. Summarization prompt for LLM.
  optional string instruction = 4 [(google.api.field_behavior) = OPTIONAL];
}

// Spec for summarization verbosity score metric.
message SummarizationVerbositySpec {
  // Optional. Whether to use instance.reference to compute summarization
  // verbosity.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for summarization verbosity metric.
message SummarizationVerbosityResult {
  // Output only. Summarization Verbosity score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for summarization verbosity score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for summarization verbosity score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for question answering quality metric.
message QuestionAnsweringQualityInput {
  // Required. Spec for question answering quality score metric.
  QuestionAnsweringQualitySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Question answering quality instance.
  QuestionAnsweringQualityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for question answering quality metric.
message QuestionAnsweringQualityInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to answer the question.
  optional string context = 3 [(google.api.field_behavior) = REQUIRED];

  // Required. Question Answering prompt for LLM.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Spec for question answering quality score metric.
message QuestionAnsweringQualitySpec {
  // Optional. Whether to use instance.reference to compute question answering
  // quality.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for question answering quality metric.
message QuestionAnsweringQualityResult {
  // Output only. Question Answering Quality score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for question answering quality score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for question answering quality score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for pairwise question answering quality metric.
message PairwiseQuestionAnsweringQualityInput {
  // Required. Spec for pairwise question answering quality score metric.
  PairwiseQuestionAnsweringQualitySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Pairwise question answering quality instance.
  PairwiseQuestionAnsweringQualityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for pairwise question answering quality metric.
message PairwiseQuestionAnsweringQualityInstance {
  // Required. Output of the candidate model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Output of the baseline model.
  optional string baseline_prediction = 2
      [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to answer the question.
  optional string context = 4 [(google.api.field_behavior) = REQUIRED];

  // Required. Question Answering prompt for LLM.
  optional string instruction = 5 [(google.api.field_behavior) = REQUIRED];
}

// Spec for pairwise question answering quality score metric.
message PairwiseQuestionAnsweringQualitySpec {
  // Optional. Whether to use instance.reference to compute pairwise question
  // answering quality.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for pairwise question answering quality metric.
message PairwiseQuestionAnsweringQualityResult {
  // Output only. Pairwise question answering prediction choice.
  PairwiseChoice pairwise_choice = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for the pairwise question answering quality
  // choice.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for the pairwise question answering quality
  // choice.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for question answering relevance metric.
message QuestionAnsweringRelevanceInput {
  // Required. Spec for question answering relevance score metric.
  QuestionAnsweringRelevanceSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Question answering relevance instance.
  QuestionAnsweringRelevanceInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for question answering relevance metric.
message QuestionAnsweringRelevanceInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Text provided as context to answer the question.
  optional string context = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. The question asked and other instruction in the inference prompt.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Spec for question answering relevance metric.
message QuestionAnsweringRelevanceSpec {
  // Optional. Whether to use instance.reference to compute question answering
  // relevance.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for question answering relevance metric.
message QuestionAnsweringRelevanceResult {
  // Output only. Question Answering Relevance score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for question answering relevance score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for question answering relevance score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for question answering helpfulness metric.
message QuestionAnsweringHelpfulnessInput {
  // Required. Spec for question answering helpfulness score metric.
  QuestionAnsweringHelpfulnessSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Question answering helpfulness instance.
  QuestionAnsweringHelpfulnessInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for question answering helpfulness metric.
message QuestionAnsweringHelpfulnessInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Text provided as context to answer the question.
  optional string context = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. The question asked and other instruction in the inference prompt.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Spec for question answering helpfulness metric.
message QuestionAnsweringHelpfulnessSpec {
  // Optional. Whether to use instance.reference to compute question answering
  // helpfulness.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for question answering helpfulness metric.
message QuestionAnsweringHelpfulnessResult {
  // Output only. Question Answering Helpfulness score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for question answering helpfulness score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for question answering helpfulness score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for question answering correctness metric.
message QuestionAnsweringCorrectnessInput {
  // Required. Spec for question answering correctness score metric.
  QuestionAnsweringCorrectnessSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Question answering correctness instance.
  QuestionAnsweringCorrectnessInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for question answering correctness metric.
message QuestionAnsweringCorrectnessInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Text provided as context to answer the question.
  optional string context = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. The question asked and other instruction in the inference prompt.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Spec for question answering correctness metric.
message QuestionAnsweringCorrectnessSpec {
  // Optional. Whether to use instance.reference to compute question answering
  // correctness.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for question answering correctness metric.
message QuestionAnsweringCorrectnessResult {
  // Output only. Question Answering Correctness score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for question answering correctness score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for question answering correctness score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for tool call valid metric.
message ToolCallValidInput {
  // Required. Spec for tool call valid metric.
  ToolCallValidSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated tool call valid instances.
  repeated ToolCallValidInstance instances = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Spec for tool call valid metric.
message ToolCallValidSpec {}

// Spec for tool call valid instance.
message ToolCallValidInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Results for tool call valid metric.
message ToolCallValidResults {
  // Output only. Tool call valid metric values.
  repeated ToolCallValidMetricValue tool_call_valid_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Tool call valid metric value for an instance.
message ToolCallValidMetricValue {
  // Output only. Tool call valid score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for tool name match metric: one metric spec applied to a list of
// instances.
message ToolNameMatchInput {
  // Required. Spec for tool name match metric.
  ToolNameMatchSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated tool name match instances.
  repeated ToolNameMatchInstance instances = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Spec for tool name match metric. Currently declares no fields.
message ToolNameMatchSpec {}

// Instance evaluated by the tool name match metric.
message ToolNameMatchInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Results for tool name match metric.
message ToolNameMatchResults {
  // Output only. Tool name match metric values.
  repeated ToolNameMatchMetricValue tool_name_match_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Tool name match metric value for an instance.
message ToolNameMatchMetricValue {
  // Output only. Tool name match score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for tool parameter key match metric: one metric spec applied to a
// list of instances.
message ToolParameterKeyMatchInput {
  // Required. Spec for tool parameter key match metric.
  ToolParameterKeyMatchSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated tool parameter key match instances.
  repeated ToolParameterKeyMatchInstance instances = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Spec for tool parameter key match metric. Currently declares no fields.
message ToolParameterKeyMatchSpec {}

// Instance evaluated by the tool parameter key match metric.
message ToolParameterKeyMatchInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Results for tool parameter key match metric.
message ToolParameterKeyMatchResults {
  // Output only. Tool parameter key match metric values.
  repeated ToolParameterKeyMatchMetricValue
      tool_parameter_key_match_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Tool parameter key match metric value for an instance.
message ToolParameterKeyMatchMetricValue {
  // Output only. Tool parameter key match score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for tool parameter key value match metric: one metric spec applied
// to a list of instances.
message ToolParameterKVMatchInput {
  // Required. Spec for tool parameter key value match metric.
  ToolParameterKVMatchSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated tool parameter key value match instances.
  repeated ToolParameterKVMatchInstance instances = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Spec for tool parameter key value match metric.
message ToolParameterKVMatchSpec {
  // Optional. Whether to use STRICT string match on parameter values.
  bool use_strict_string_match = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Instance evaluated by the tool parameter key value match metric.
message ToolParameterKVMatchInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Results for tool parameter key value match metric.
message ToolParameterKVMatchResults {
  // Output only. Tool parameter key value match metric values.
  repeated ToolParameterKVMatchMetricValue
      tool_parameter_kv_match_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Tool parameter key value match metric value for an instance.
message ToolParameterKVMatchMetricValue {
  // Output only. Tool parameter key value match score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}