1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include <vector>
16
17 #include "tensorflow/core/framework/fake_input.h"
18 #include "tensorflow/core/framework/node_def_builder.h"
19 #include "tensorflow/core/framework/shape_inference.h"
20 #include "tensorflow/core/framework/shape_inference_testutil.h"
21 #include "tensorflow/core/framework/tensor.h"
22 #include "tensorflow/core/framework/tensor_shape.h"
23 #include "tensorflow/core/framework/tensor_testutil.h"
24 #include "tensorflow/core/framework/types.pb.h"
25 #include "tensorflow/core/kernels/ops_testutil.h"
26 #include "tensorflow/core/lib/core/status.h"
27 #include "tensorflow/core/lib/core/status_test_util.h"
28
29 namespace tensorflow {
30 namespace text {
31
32 using tensorflow::FakeInput;
33 using tensorflow::NodeDefBuilder;
34 using tensorflow::Status;
35 using tensorflow::TensorShape;
36
37 class NgramKernelTest : public tensorflow::OpsTestBase {
38 public:
MakeOp(string separator,std::vector<int> ngram_width,string left_pad,string right_pad,int pad_width,bool preserve)39 void MakeOp(string separator, std::vector<int> ngram_width, string left_pad,
40 string right_pad, int pad_width, bool preserve) {
41 TF_ASSERT_OK(NodeDefBuilder("tested_op", "StringNGrams")
42 .Attr("separator", separator)
43 .Attr("ngram_widths", ngram_width)
44 .Attr("left_pad", left_pad)
45 .Attr("right_pad", right_pad)
46 .Attr("pad_width", pad_width)
47 .Attr("preserve_short_sequences", preserve)
48 .Input(FakeInput())
49 .Input(FakeInput())
50 .Finalize(node_def()));
51 TF_ASSERT_OK(InitOp());
52 }
53
assert_string_equal(const std::vector<tstring> & expected,const Tensor & value)54 void assert_string_equal(const std::vector<tstring> &expected,
55 const Tensor &value) {
56 Tensor expected_tensor(
57 allocator(), DT_STRING,
58 TensorShape({static_cast<int64_t>(expected.size())}));
59 test::FillValues<tstring>(&expected_tensor, expected);
60 test::ExpectTensorEqual<tstring>(expected_tensor, value);
61 }
assert_int64_equal(const std::vector<int64_t> & expected,const Tensor & value)62 void assert_int64_equal(const std::vector<int64_t> &expected,
63 const Tensor &value) {
64 Tensor expected_tensor(
65 allocator(), DT_INT64,
66 TensorShape({static_cast<int64_t>(expected.size())}));
67 test::FillValues<int64_t>(&expected_tensor, expected);
68 test::ExpectTensorEqual<int64_t>(expected_tensor, value);
69 }
70 };
71
// Trigrams with pad_width = -1: the expected output carries two pad tokens on
// each side of every row.
TEST_F(NgramKernelTest, TestPaddedTrigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/-1, /*preserve=*/false);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 6, 10};
  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|LP|a", "LP|a|b", "a|b|c", "b|c|d", "c|d|RP", "d|RP|RP",
      // row 1
      "LP|LP|e", "LP|e|f", "e|f|RP", "f|RP|RP"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
89
// Two n-gram widths with pad_width = -1: within each row the expected output
// lists every padded bigram first, followed by every padded trigram.
TEST_F(NgramKernelTest, TestPaddedBigramsAndTrigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2, 3}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/-1, /*preserve=*/false);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 11, 18};
  const std::vector<tstring> want_ngrams = {
      // row 0: bigrams, then trigrams
      "LP|a", "a|b", "b|c", "c|d", "d|RP",  //
      "LP|LP|a", "LP|a|b", "a|b|c", "b|c|d", "c|d|RP", "d|RP|RP",
      // row 1: bigrams, then trigrams
      "LP|e", "e|f", "f|RP",  //
      "LP|LP|e", "LP|e|f", "e|f|RP", "f|RP|RP"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
108
// Bigrams with pad_width = -1: the expected output has a single pad token on
// each side of every row.
TEST_F(NgramKernelTest, TestPaddedBigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/-1, /*preserve=*/false);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 5, 8};
  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a", "a|b", "b|c", "c|d", "d|RP",
      // row 1
      "LP|e", "e|f", "f|RP"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
126
// Requests a pad_width (4) larger than the bigram width allows. The expected
// output still contains at most one pad token per n-gram, i.e. it matches the
// singly padded bigram result.
TEST_F(NgramKernelTest, TestPaddingIsAtMostNGramSizeMinus1) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/4, /*preserve=*/false);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 5, 8};
  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a", "a|b", "b|c", "c|d", "d|RP",
      // row 1
      "LP|e", "e|f", "f|RP"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
144
// Unigrams plus bigrams with pad_width = -1: the expected unigrams carry no
// padding at all, while the bigrams get one pad token per side.
TEST_F(NgramKernelTest, TestPaddedUnigramAndBigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{1, 2}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/-1, /*preserve=*/false);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 9, 14};
  const std::vector<tstring> want_ngrams = {
      // row 0: unigrams, then bigrams
      "a", "b", "c", "d", "LP|a", "a|b", "b|c", "c|d", "d|RP",
      // row 1: unigrams, then bigrams
      "e", "f", "LP|e", "e|f", "f|RP"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
162
// Checks n-grams in which left and right padding meet inside a single n-gram
// (row 0 holds one token, so "LP|a|RP" must be produced).
TEST_F(NgramKernelTest, TestOverlappingPaddedNGrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/-1, /*preserve=*/false);
  // Ragged input with three rows:
  //   row 0: "a"
  //   row 1: "b", "c", "d"
  //   row 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 3, 8, 12};
  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|LP|a", "LP|a|RP", "a|RP|RP",
      // row 1
      "LP|LP|b", "LP|b|c", "b|c|d", "c|d|RP", "d|RP|RP",
      // row 2
      "LP|LP|e", "LP|e|f", "e|f|RP", "f|RP|RP"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
184
// Same layout as the overlapping padded trigram case, but every token is two
// characters long.
TEST_F(NgramKernelTest, TestOverlappingPaddedMultiCharNGrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/-1, /*preserve=*/false);
  // Ragged input with three rows:
  //   row 0: "aa"
  //   row 1: "bb", "cc", "dd"
  //   row 2: "ee", "ff"
  AddInputFromArray<tstring>(TensorShape({6}),
                             {"aa", "bb", "cc", "dd", "ee", "ff"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 3, 8, 12};
  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|LP|aa", "LP|aa|RP", "aa|RP|RP",
      // row 1
      "LP|LP|bb", "LP|bb|cc", "bb|cc|dd", "cc|dd|RP", "dd|RP|RP",
      // row 2
      "LP|LP|ee", "LP|ee|ff", "ee|ff|RP", "ff|RP|RP"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
205
// Checks n-grams that contain more than one pad token on each side: a single
// token expanded into 5-grams.
TEST_F(NgramKernelTest, TestMultiOverlappingPaddedNGrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{5}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/-1, /*preserve=*/false);
  // Ragged input with a single row:
  //   row 0: "a"
  AddInputFromArray<tstring>(TensorShape({1}), {"a"});
  AddInputFromArray<int64_t>(TensorShape({2}), {0, 1});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 5};
  const std::vector<tstring> want_ngrams = {
      "LP|LP|LP|LP|a",  //
      "LP|LP|LP|a|RP",  //
      "LP|LP|a|RP|RP",  //
      "LP|a|RP|RP|RP",  //
      "a|RP|RP|RP|RP"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
224
// Trigrams without padding: the two-token row is too short and is expected to
// contribute nothing.
TEST_F(NgramKernelTest, TestUnpaddedTrigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/false);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 2, 2};
  const std::vector<tstring> want_ngrams = {"a|b|c", "b|c|d"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
240
// Unpadded trigrams where the input splits describe an empty row between the
// two populated ones.
TEST_F(NgramKernelTest, TestUnpaddedTrigramsWithEmptySequence) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/false);
  // Ragged input with three rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: (empty)
  //   row 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 4, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 2, 2, 2};
  const std::vector<tstring> want_ngrams = {"a|b|c", "b|c|d"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
256
// Unpadded trigrams with preserve_short_sequences enabled: the two-token row
// is expected to yield a single shorter n-gram, "e|f".
TEST_F(NgramKernelTest, TestUnpaddedTrigramsWithPreserveShort) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/true);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 2, 3};
  const std::vector<tstring> want_ngrams = {"a|b|c", "b|c|d", "e|f"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
272
// preserve_short_sequences combined with an empty row: the empty row is
// expected to stay empty, while the short row still yields "e|f".
TEST_F(NgramKernelTest, TestUnpaddedTrigramsWithPreserveShortAndEmptySequence) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/true);
  // Ragged input with three rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: (empty)
  //   row 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 4, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 2, 2, 3};
  const std::vector<tstring> want_ngrams = {"a|b|c", "b|c|d", "e|f"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
288
// Widths {4, 3} without padding and with preserve_short_sequences enabled:
// row 0 is expected to list its 4-gram before its trigrams, and the row that
// is too short for both widths yields "e|f".
TEST_F(NgramKernelTest, TestUnpaddedTrigramsAndQuadgramsWithPreserveShort) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{4, 3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/true);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 3, 4};
  const std::vector<tstring> want_ngrams = {"a|b|c|d", "a|b|c", "b|c|d",
                                            "e|f"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
304
// Bigrams and trigrams without padding: the two-token row is expected to
// produce only its bigram.
TEST_F(NgramKernelTest, TestUnpaddedBigramsAndTrigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2, 3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/false);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 5, 6};
  const std::vector<tstring> want_ngrams = {
      // row 0: bigrams, then trigrams
      "a|b", "b|c", "c|d", "a|b|c", "b|c|d",
      // row 1
      "e|f"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
321
// Bigrams and trigrams without padding, preserve_short_sequences enabled.
TEST_F(NgramKernelTest, TestUnpaddedBigramsAndTrigramsWithPreserveShort) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2, 3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/true);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  // Row 1 already yields the regular bigram 'e|f', so no additional
  // preserve_short n-gram is expected for it.
  const std::vector<int64_t> want_splits = {0, 5, 6};
  const std::vector<tstring> want_ngrams = {
      // row 0: bigrams, then trigrams
      "a|b", "b|c", "c|d", "a|b|c", "b|c|d",
      // row 1
      "e|f"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
340
// Same as the {2, 3} preserve_short case but with the widths given as {3, 2}:
// row 0 is expected to list its trigrams before its bigrams.
TEST_F(NgramKernelTest, TestUnpaddedTrigramsAndBigramsWithPreserveShort) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3, 2}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/true);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  // Row 1 already yields the regular bigram 'e|f', so no additional
  // preserve_short n-gram is expected for it.
  const std::vector<int64_t> want_splits = {0, 5, 6};
  const std::vector<tstring> want_ngrams = {
      // row 0: trigrams, then bigrams
      "a|b|c", "b|c|d", "a|b", "b|c", "c|d",
      // row 1
      "e|f"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
359
// Plain bigrams, no padding.
TEST_F(NgramKernelTest, TestUnpaddedBigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/false);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 3, 4};
  const std::vector<tstring> want_ngrams = {"a|b", "b|c", "c|d", "e|f"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
375
// Unpadded trigrams over rows of length 1, 3 and 2: only the three-token row
// is expected to produce an n-gram.
TEST_F(NgramKernelTest, TestOverlappingUnpaddedNGrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/false);
  // Ragged input with three rows:
  //   row 0: "a"
  //   row 1: "b", "c", "d"
  //   row 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 0, 1, 1};
  const std::vector<tstring> want_ngrams = {"b|c|d"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
392
// Unpadded 5-grams over rows that are all shorter than five tokens: no
// n-grams are expected and every output split stays at zero.
TEST_F(NgramKernelTest, TestOverlappingUnpaddedNGramsNoOutput) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{5}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/false);
  // Ragged input with three rows:
  //   row 0: "a"
  //   row 1: "b", "c", "d"
  //   row 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 0, 0, 0};
  const std::vector<tstring> want_ngrams;  // Nothing is produced.

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
409
// Trigrams with pad_width = 1: each expected n-gram contains at most one pad
// token.
TEST_F(NgramKernelTest, TestSinglyPaddedTrigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/1, /*preserve=*/false);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 4, 6};
  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a|b", "a|b|c", "b|c|d", "c|d|RP",
      // row 1
      "LP|e|f", "e|f|RP"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
427
// Bigrams with pad_width = 1.
TEST_F(NgramKernelTest, TestSinglyPaddedBigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/1, /*preserve=*/false);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 5, 8};
  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a", "a|b", "b|c", "c|d", "d|RP",
      // row 1
      "LP|e", "e|f", "f|RP"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
444
// Bigrams and 5-grams with pad_width = 1: the two-token row is expected to
// produce bigrams only, since it is too short for a 5-gram even when padded.
TEST_F(NgramKernelTest, TestSinglyPaddedBigramsAnd5grams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{2, 5}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/1, /*preserve=*/false);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 7, 10};
  const std::vector<tstring> want_ngrams = {
      // row 0: bigrams, then 5-grams
      "LP|a", "a|b", "b|c", "c|d", "d|RP", "LP|a|b|c|d", "a|b|c|d|RP",
      // row 1: bigrams only
      "LP|e", "e|f", "f|RP"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
462
// 5-grams with pad_width = 1 and preserve_short_sequences enabled: the
// two-token row is expected to yield one shorter, padded n-gram
// ("LP|e|f|RP").
TEST_F(NgramKernelTest, TestSinglyPadded5gramsWithPreserveShort) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{5}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/1, /*preserve=*/true);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 2, 3};
  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a|b|c|d", "a|b|c|d|RP",
      // row 1
      "LP|e|f|RP"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
480
// Trigrams with pad_width = 1 over rows of length 1, 3 and 2; the one-token
// row is expected to yield a single n-gram padded on both sides.
TEST_F(NgramKernelTest, TestOverlappingSinglyPaddedNGrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/1, /*preserve=*/false);
  // Ragged input with three rows:
  //   row 0: "a"
  //   row 1: "b", "c", "d"
  //   row 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 1, 4, 6};
  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a|RP",
      // row 1
      "LP|b|c", "b|c|d", "c|d|RP",
      // row 2
      "LP|e|f", "e|f|RP"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
500
// 5-grams with pad_width = 1 over rows of length 1, 3 and 2: only the
// three-token row is expected to produce an n-gram; the other rows produce
// nothing.
TEST_F(NgramKernelTest, TestOverlappingSinglyPaddedNGramsNoOutput) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{5}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/1, /*preserve=*/false);
  // Ragged input with three rows:
  //   row 0: "a"
  //   row 1: "b", "c", "d"
  //   row 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 0, 1, 1};
  const std::vector<tstring> want_ngrams = {"LP|b|c|d|RP"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
517
// Unigrams with pad_width = 1: the expected output is just the input tokens,
// with no pad tokens and unchanged splits.
TEST_F(NgramKernelTest, TestSinglyPaddedUnigrams) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{1}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/1, /*preserve=*/false);
  // Ragged input with two rows:
  //   row 0: "a", "b", "c", "d"
  //   row 1: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 4, 6};
  const std::vector<tstring> want_ngrams = {"a", "b", "c", "d", "e", "f"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
533
// Zero-length values and zero-length splits: the kernel is expected to
// succeed and return two empty outputs.
TEST_F(NgramKernelTest, TestEmptyInput) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{1}, /*left_pad=*/"LP",
         /*right_pad=*/"RP", /*pad_width=*/3, /*preserve=*/false);
  AddInputFromArray<tstring>(TensorShape({0}), {});
  AddInputFromArray<int64_t>(TensorShape({0}), {});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits;  // Empty.
  const std::vector<tstring> want_ngrams;  // Empty.

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
546
// Padded trigrams where the first row holds no tokens at all: that row is
// still expected to produce n-grams made purely of pad tokens.
TEST_F(NgramKernelTest, TestNoTokens) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"L",
         /*right_pad=*/"R", /*pad_width=*/-1, /*preserve=*/false);
  // Ragged input with two rows:
  //   row 0: (empty)
  //   row 1: "a"
  AddInputFromArray<tstring>(TensorShape({1}), {"a"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 0, 1});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 2, 5};
  const std::vector<tstring> want_ngrams = {
      // row 0: padding only
      "L|L|R", "L|R|R",
      // row 1
      "L|L|a", "L|a|R", "a|R|R"};

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
564
// Unpadded trigrams over an empty row and a one-token row: neither row can
// form a trigram, so no output values are expected.
TEST_F(NgramKernelTest, TestNoTokensNoPad) {
  MakeOp(/*separator=*/"|", /*ngram_width=*/{3}, /*left_pad=*/"",
         /*right_pad=*/"", /*pad_width=*/0, /*preserve=*/false);
  // Ragged input with two rows:
  //   row 0: (empty)
  //   row 1: "a"
  AddInputFromArray<tstring>(TensorShape({1}), {"a"});
  AddInputFromArray<int64_t>(TensorShape({3}), {0, 0, 1});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<int64_t> want_splits = {0, 0, 0};
  const std::vector<tstring> want_ngrams;  // Nothing is produced.

  assert_string_equal(want_ngrams, *GetOutput(0));
  assert_int64_equal(want_splits, *GetOutput(1));
}
580
// NOTE: No test is defined here on purpose. TEST_F(NgramKernelTest,
// TestNoTokens) is already defined earlier in this file; defining it a second
// time redefines the test class GoogleTest generates for that suite/test name
// pair and breaks the build. The removed copy checked exactly the same inputs
// and expectations, only spelled with the legacy `int64` alias.
598
// NOTE: No test is defined here on purpose. TEST_F(NgramKernelTest,
// TestNoTokensNoPad) is already defined earlier in this file; defining it a
// second time redefines the test class GoogleTest generates for that
// suite/test name pair and breaks the build. The removed copy checked exactly
// the same inputs and expectations, only spelled with the legacy `int64`
// alias.
614
// Exercises the shape function registered for "StringNGrams".
TEST_F(NgramKernelTest, ShapeFn) {
  static constexpr char kScalarInputError[] =
      "Shape must be rank 1 but is rank 0";
  ShapeInferenceTestOp ngrams_op("StringNGrams");

  // Unknown or rank-1 inputs: both outputs are inferred as vectors. Once the
  // second input's shape is known, the second output is expected to match it
  // ("in1").
  INFER_OK(ngrams_op, "?;?", "[?];[?]");
  INFER_OK(ngrams_op, "[1];?", "[?];[?]");
  INFER_OK(ngrams_op, "[1];[2]", "[?];in1");

  // A scalar in either input position is rejected.
  INFER_ERROR(kScalarInputError, ngrams_op, "[];?");
  INFER_ERROR(kScalarInputError, ngrams_op, "?;[]");
}
623
624 } // namespace text
625 } // namespace tensorflow
626