xref: /aosp_15_r20/external/tensorflow/tensorflow/core/kernels/string_ngrams_op_test.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include <vector>
16 
17 #include "tensorflow/core/framework/fake_input.h"
18 #include "tensorflow/core/framework/node_def_builder.h"
19 #include "tensorflow/core/framework/shape_inference.h"
20 #include "tensorflow/core/framework/shape_inference_testutil.h"
21 #include "tensorflow/core/framework/tensor.h"
22 #include "tensorflow/core/framework/tensor_shape.h"
23 #include "tensorflow/core/framework/tensor_testutil.h"
24 #include "tensorflow/core/framework/types.pb.h"
25 #include "tensorflow/core/kernels/ops_testutil.h"
26 #include "tensorflow/core/lib/core/status.h"
27 #include "tensorflow/core/lib/core/status_test_util.h"
28 
29 namespace tensorflow {
30 namespace text {
31 
32 using tensorflow::FakeInput;
33 using tensorflow::NodeDefBuilder;
34 using tensorflow::Status;
35 using tensorflow::TensorShape;
36 
37 class NgramKernelTest : public tensorflow::OpsTestBase {
38  public:
MakeOp(string separator,std::vector<int> ngram_width,string left_pad,string right_pad,int pad_width,bool preserve)39   void MakeOp(string separator, std::vector<int> ngram_width, string left_pad,
40               string right_pad, int pad_width, bool preserve) {
41     TF_ASSERT_OK(NodeDefBuilder("tested_op", "StringNGrams")
42                      .Attr("separator", separator)
43                      .Attr("ngram_widths", ngram_width)
44                      .Attr("left_pad", left_pad)
45                      .Attr("right_pad", right_pad)
46                      .Attr("pad_width", pad_width)
47                      .Attr("preserve_short_sequences", preserve)
48                      .Input(FakeInput())
49                      .Input(FakeInput())
50                      .Finalize(node_def()));
51     TF_ASSERT_OK(InitOp());
52   }
53 
assert_string_equal(const std::vector<tstring> & expected,const Tensor & value)54   void assert_string_equal(const std::vector<tstring> &expected,
55                            const Tensor &value) {
56     Tensor expected_tensor(
57         allocator(), DT_STRING,
58         TensorShape({static_cast<int64_t>(expected.size())}));
59     test::FillValues<tstring>(&expected_tensor, expected);
60     test::ExpectTensorEqual<tstring>(expected_tensor, value);
61   }
assert_int64_equal(const std::vector<int64_t> & expected,const Tensor & value)62   void assert_int64_equal(const std::vector<int64_t> &expected,
63                           const Tensor &value) {
64     Tensor expected_tensor(
65         allocator(), DT_INT64,
66         TensorShape({static_cast<int64_t>(expected.size())}));
67     test::FillValues<int64_t>(&expected_tensor, expected);
68     test::ExpectTensorEqual<int64_t>(expected_tensor, value);
69   }
70 };
71 
// Trigrams with pad_width = -1: every row is expected to be framed by two pad
// tokens on each side.
TEST_F(NgramKernelTest, TestPaddedTrigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/-1, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "LP|LP|a", "LP|a|b", "a|b|c", "b|c|d", "c|d|RP", "d|RP|RP",  // row 0
      "LP|LP|e", "LP|e|f", "e|f|RP", "f|RP|RP",                    // row 1
  };
  const std::vector<int64_t> want_splits{0, 6, 10};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
89 
// Two widths (2 then 3) with pad_width = -1: within each row the expected
// output lists all bigrams first, followed by all trigrams.
TEST_F(NgramKernelTest, TestPaddedBigramsAndTrigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2, 3}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/-1, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      // row 0: bigrams, then trigrams
      "LP|a", "a|b", "b|c", "c|d", "d|RP",
      "LP|LP|a", "LP|a|b", "a|b|c", "b|c|d", "c|d|RP", "d|RP|RP",
      // row 1: bigrams, then trigrams
      "LP|e", "e|f", "f|RP",
      "LP|LP|e", "LP|e|f", "e|f|RP", "f|RP|RP",
  };
  const std::vector<int64_t> want_splits{0, 11, 18};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
108 
// Bigrams with pad_width = -1: one pad token is expected on each side of every
// row.
TEST_F(NgramKernelTest, TestPaddedBigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/-1, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "LP|a", "a|b", "b|c", "c|d", "d|RP",  // row 0
      "LP|e", "e|f", "f|RP",                // row 1
  };
  const std::vector<int64_t> want_splits{0, 5, 8};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
126 
// Requests pad_width = 4 for bigrams; the expected output still contains only a
// single pad token per side, i.e. the same n-grams as TestPaddedBigrams.
TEST_F(NgramKernelTest, TestPaddingIsAtMostNGramSizeMinus1) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/4, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "LP|a", "a|b", "b|c", "c|d", "d|RP",  // row 0
      "LP|e", "e|f", "f|RP",                // row 1
  };
  const std::vector<int64_t> want_splits{0, 5, 8};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
144 
// Widths 1 and 2 with pad_width = -1: the expected unigrams carry no padding,
// while the bigrams get one pad token on each side.
TEST_F(NgramKernelTest, TestPaddedUnigramAndBigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{1, 2}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/-1, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      // row 0: unigrams, then bigrams
      "a", "b", "c", "d",
      "LP|a", "a|b", "b|c", "c|d", "d|RP",
      // row 1: unigrams, then bigrams
      "e", "f",
      "LP|e", "e|f", "f|RP",
  };
  const std::vector<int64_t> want_splits{0, 9, 14};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
162 
// Checks that a single n-gram may contain both left and right padding, which
// happens for the one-token row below ("LP|a|RP").
TEST_F(NgramKernelTest, TestOverlappingPaddedNGrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/-1, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a"
  //   row 1: "b", "c", "d"
  //   row 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "LP|LP|a", "LP|a|RP", "a|RP|RP",                    // row 0
      "LP|LP|b", "LP|b|c", "b|c|d", "c|d|RP", "d|RP|RP",  // row 1
      "LP|LP|e", "LP|e|f", "e|f|RP", "f|RP|RP",           // row 2
  };
  const std::vector<int64_t> want_splits{0, 3, 8, 12};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
184 
// Same layout as TestOverlappingPaddedNGrams, but every token is two characters
// long.
TEST_F(NgramKernelTest, TestOverlappingPaddedMultiCharNGrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/-1, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "aa"
  //   row 1: "bb", "cc", "dd"
  //   row 2: "ee", "ff"
  AddInputFromArray<tstring>(TensorShape({6}),
                             {"aa", "bb", "cc", "dd", "ee", "ff"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "LP|LP|aa", "LP|aa|RP", "aa|RP|RP",                          // row 0
      "LP|LP|bb", "LP|bb|cc", "bb|cc|dd", "cc|dd|RP", "dd|RP|RP",  // row 1
      "LP|LP|ee", "LP|ee|ff", "ee|ff|RP", "ff|RP|RP",              // row 2
  };
  const std::vector<int64_t> want_splits{0, 3, 8, 12};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
205 
// Checks n-grams that hold more than one pad token on each side: a single token
// expanded into 5-grams.
TEST_F(NgramKernelTest, TestMultiOverlappingPaddedNGrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{5}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/-1, /*preserve=*/false);

  // A single row holding a single token:
  //   row 0: "a"
  AddInputFromArray<tstring>(TensorShape({1}), {"a"});
  AddInputFromArray<int64_t>(TensorShape({2}), {0, 1});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "LP|LP|LP|LP|a",  //
      "LP|LP|LP|a|RP",  //
      "LP|LP|a|RP|RP",  //
      "LP|a|RP|RP|RP",  //
      "a|RP|RP|RP|RP",  //
  };
  const std::vector<int64_t> want_splits{0, 5};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
224 
// Trigrams without padding: the two-token row is too short and is expected to
// contribute nothing.
TEST_F(NgramKernelTest, TestUnpaddedTrigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{"a|b|c", "b|c|d"};
  const std::vector<int64_t> want_splits{0, 2, 2};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
240 
// Unpadded trigrams where the splits also describe an empty row; both the empty
// row and the two-token row are expected to yield no n-grams.
TEST_F(NgramKernelTest, TestUnpaddedTrigramsWithEmptySequence) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: (empty)
  //   row 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 4, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{"a|b|c", "b|c|d"};
  const std::vector<int64_t> want_splits{0, 2, 2, 2};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
256 
// Unpadded trigrams with preserve_short_sequences enabled: the two-token row is
// expected to be emitted as the single short n-gram "e|f".
TEST_F(NgramKernelTest, TestUnpaddedTrigramsWithPreserveShort) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/true);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "a|b|c", "b|c|d",  // row 0
      "e|f",             // row 1
  };
  const std::vector<int64_t> want_splits{0, 2, 3};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
272 
// preserve_short_sequences with an empty row present: the empty row is expected
// to stay empty while the two-token row still yields "e|f".
TEST_F(NgramKernelTest, TestUnpaddedTrigramsWithPreserveShortAndEmptySequence) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/true);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: (empty)
  //   row 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 4, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "a|b|c", "b|c|d",  // row 0
      "e|f",             // row 2
  };
  const std::vector<int64_t> want_splits{0, 2, 2, 3};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
288 
// Widths 4 then 3, unpadded, with preserve_short_sequences: the two-token row
// fits neither width and is expected to come out as "e|f".
TEST_F(NgramKernelTest, TestUnpaddedTrigramsAndQuadgramsWithPreserveShort) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{4, 3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/true);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "a|b|c|d", "a|b|c", "b|c|d",  // row 0: 4-grams, then trigrams
      "e|f",                        // row 1
  };
  const std::vector<int64_t> want_splits{0, 3, 4};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
304 
// Widths 2 then 3 without padding: the two-token row is expected to produce its
// one bigram and no trigram.
TEST_F(NgramKernelTest, TestUnpaddedBigramsAndTrigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2, 3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "a|b", "b|c", "c|d", "a|b|c", "b|c|d",  // row 0
      "e|f",                                  // row 1
  };
  const std::vector<int64_t> want_splits{0, 5, 6};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
321 
// Widths 2 then 3 with preserve_short_sequences: the expectation matches the
// non-preserving variant.
TEST_F(NgramKernelTest, TestUnpaddedBigramsAndTrigramsWithPreserveShort) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2, 3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/true);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  // Row 1 already yields the regular bigram 'e|f', so the op is not expected
  // to add a separate preserve_short n-gram for it.
  const std::vector<tstring> want_ngrams{
      "a|b", "b|c", "c|d", "a|b|c", "b|c|d",  // row 0
      "e|f",                                  // row 1
  };
  const std::vector<int64_t> want_splits{0, 5, 6};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
340 
// Widths 3 then 2 with preserve_short_sequences: within a row the expected
// output follows the width order, so trigrams come before bigrams.
TEST_F(NgramKernelTest, TestUnpaddedTrigramsAndBigramsWithPreserveShort) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3, 2}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/true);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  // Row 1 already yields the regular bigram 'e|f', so the op is not expected
  // to add a separate preserve_short n-gram for it.
  const std::vector<tstring> want_ngrams{
      "a|b|c", "b|c|d", "a|b", "b|c", "c|d",  // row 0
      "e|f",                                  // row 1
  };
  const std::vector<int64_t> want_splits{0, 5, 6};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
359 
// Bigrams without any padding.
TEST_F(NgramKernelTest, TestUnpaddedBigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "a|b", "b|c", "c|d",  // row 0
      "e|f",                // row 1
  };
  const std::vector<int64_t> want_splits{0, 3, 4};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
375 
// Unpadded trigrams over rows of length 1, 3 and 2: only the three-token row is
// expected to produce an n-gram.
TEST_F(NgramKernelTest, TestOverlappingUnpaddedNGrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a"
  //   row 1: "b", "c", "d"
  //   row 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{"b|c|d"};
  const std::vector<int64_t> want_splits{0, 0, 1, 1};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
392 
// Unpadded 5-grams over rows that are all shorter than five tokens: no n-grams
// are expected at all.
TEST_F(NgramKernelTest, TestOverlappingUnpaddedNGramsNoOutput) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{5}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a"
  //   row 1: "b", "c", "d"
  //   row 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams;  // intentionally empty
  const std::vector<int64_t> want_splits{0, 0, 0, 0};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
409 
// Trigrams with pad_width = 1: exactly one pad token is expected on each side
// of every row.
TEST_F(NgramKernelTest, TestSinglyPaddedTrigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/1, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "LP|a|b", "a|b|c", "b|c|d", "c|d|RP",  // row 0
      "LP|e|f", "e|f|RP",                    // row 1
  };
  const std::vector<int64_t> want_splits{0, 4, 6};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
427 
// Bigrams with pad_width = 1.
TEST_F(NgramKernelTest, TestSinglyPaddedBigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/1, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "LP|a", "a|b", "b|c", "c|d", "d|RP",  // row 0
      "LP|e", "e|f", "f|RP",                // row 1
  };
  const std::vector<int64_t> want_splits{0, 5, 8};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
444 
// Widths 2 and 5 with pad_width = 1: the two-token row is expected to produce
// bigrams only, since it is too short for a 5-gram even with one pad per side.
TEST_F(NgramKernelTest, TestSinglyPaddedBigramsAnd5grams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2, 5}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/1, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      // row 0: bigrams, then 5-grams
      "LP|a", "a|b", "b|c", "c|d", "d|RP",
      "LP|a|b|c|d", "a|b|c|d|RP",
      // row 1: bigrams only
      "LP|e", "e|f", "f|RP",
  };
  const std::vector<int64_t> want_splits{0, 7, 10};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
462 
// 5-grams with pad_width = 1 and preserve_short_sequences: the two-token row is
// expected to be emitted as one short, padded n-gram ("LP|e|f|RP").
TEST_F(NgramKernelTest, TestSinglyPadded5gramsWithPreserveShort) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{5}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/1, /*preserve=*/true);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "LP|a|b|c|d", "a|b|c|d|RP",  // row 0
      "LP|e|f|RP",                 // row 1
  };
  const std::vector<int64_t> want_splits{0, 2, 3};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
480 
// Trigrams with pad_width = 1 over rows of length 1, 3 and 2; the one-token row
// is expected to yield a single n-gram padded on both sides.
TEST_F(NgramKernelTest, TestOverlappingSinglyPaddedNGrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/1, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a"
  //   row 1: "b", "c", "d"
  //   row 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "LP|a|RP",                    // row 0
      "LP|b|c", "b|c|d", "c|d|RP",  // row 1
      "LP|e|f", "e|f|RP",           // row 2
  };
  const std::vector<int64_t> want_splits{0, 1, 4, 6};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
500 
// 5-grams with pad_width = 1 over rows of length 1, 3 and 2. Despite the
// "NoOutput" name, the three-token row is expected to yield one n-gram; the
// other two rows are expected to yield none.
TEST_F(NgramKernelTest, TestOverlappingSinglyPaddedNGramsNoOutput) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{5}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/1, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a"
  //   row 1: "b", "c", "d"
  //   row 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{"LP|b|c|d|RP"};
  const std::vector<int64_t> want_splits{0, 0, 1, 1};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
517 
// Unigrams with pad_width = 1: the expected output is the input tokens
// unchanged, with no pad tokens appearing.
TEST_F(NgramKernelTest, TestSinglyPaddedUnigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{1}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/1, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "a", "b", "c", "d",  // row 0
      "e", "f",            // row 1
  };
  const std::vector<int64_t> want_splits{0, 4, 6};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
533 
// Zero-length values and zero-length splits: both outputs are expected to be
// empty as well.
TEST_F(NgramKernelTest, TestEmptyInput) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{1}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/3, /*preserve=*/false);

  AddInputFromArray<tstring>(TensorShape({0}), {});
  AddInputFromArray<int64_t>(TensorShape({0}), {});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams;  // intentionally empty
  const std::vector<int64_t> want_splits;  // intentionally empty

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
546 
// Padded trigrams where the first row holds no tokens: that row is still
// expected to produce n-grams made purely of pad tokens.
TEST_F(NgramKernelTest, TestNoTokens) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"L",
         /*right_pad=*/"R", /*pad_width=*/-1, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: (empty)
  //   row 1: "a"
  AddInputFromArray<tstring>(TensorShape({1}), {"a"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 0, 1});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{
      "L|L|R", "L|R|R",           // row 0: padding only
      "L|L|a", "L|a|R", "a|R|R",  // row 1
  };
  const std::vector<int64_t> want_splits{0, 2, 5};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
564 
// Unpadded trigrams over an empty row and a one-token row: neither is expected
// to produce any n-gram.
TEST_F(NgramKernelTest, TestNoTokensNoPad) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/false);

  // Flat token values, then the row splits that partition them:
  //   row 0: (empty)
  //   row 1: "a"
  AddInputFromArray<tstring>(TensorShape({1}), {"a"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 0, 1});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams;  // intentionally empty
  const std::vector<int64_t> want_splits{0, 0, 0};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
580 
// TestNoTokens and TestNoTokensNoPad are each defined exactly once, above.
// TEST_F generates one class per (fixture, test name) pair, so repeating either
// definition here would be a redefinition error at compile time.
614 
// Shape-inference checks for the StringNGrams op. Each spec string lists the
// shapes of the two inputs (or the two outputs) separated by ';'.
TEST_F(NgramKernelTest, ShapeFn) {
  ShapeInferenceTestOp ngrams_op("StringNGrams");

  // Accepted inputs: the first output is always reported as [?]; the second
  // output is [?] unless the second input's shape is known, in which case it
  // is that input's shape.
  INFER_OK(ngrams_op, "?;?", "[?];[?]");
  INFER_OK(ngrams_op, "[1];?", "[?];[?]");
  INFER_OK(ngrams_op, "[1];[2]", "[?];in1");

  // A rank-0 shape is rejected for either input.
  INFER_ERROR("Shape must be rank 1 but is rank 0", ngrams_op, "[];?");
  INFER_ERROR("Shape must be rank 1 but is rank 0", ngrams_op, "?;[]");
}
623 
624 }  // namespace text
625 }  // namespace tensorflow
626