# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
# This file has functions that aid in the estimation of a distribution when the
#     dictionary is unknown. There are functions for estimating pairwise joint
#     ngram distributions, pruning out false positives, and combining the two
#     steps.
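#
# A sketch of a typical decoding pass (hypothetical caller; report_data,
#     ngram_params and params are assumed to be assembled elsewhere, e.g. by
#     the simulation driver):
#
# > res <- EstimateDictionary(report_data, N, ngram_params, params)
# > res$found_candidates     # character vector of reconstructed strings
# > res$pairwise_candidates  # matrices of significant ngram pairs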

FindPairwiseCandidates <- function(report_data, N, ngram_params, params) {
  # Finds the pairwise most likely ngrams.
  #
  # Args:
  #   report_data: Object containing data relevant to reports:
  #       $inds: The indices of reports collected using various pairs
  #       $cohorts: The cohort of each report
  #       $map: The map used for all the ngrams
  #       $reports: The reports used for each ngram and full string
  #   N: Number of reports collected
  #   ngram_params: Parameters related to ngram size
  #   params: Parameter list.
  #
  # Returns:
  #   List with candidate_strs (one matrix of significant ngram pairs per
  #       ngram position pair) and dists (the pairwise joint distributions),
  #       or NULL if any position pair yields no candidates.

  inds <- report_data$inds
  cohorts <- report_data$cohorts
  num_ngrams_collected <- ngram_params$num_ngrams_collected
  map <- report_data$map
  reports <- report_data$reports

  # Cycle over all the unique pairs of ngrams being collected
  found_candidates <- list()

  # Generate the map list to be used for all ngrams
  maps <- lapply(1:num_ngrams_collected, function(x) map)
  num_candidate_ngrams <- length(inds)

  .ComputeDist <- function(i, inds, cohorts, reports, maps, params,
                           num_ngrams_collected) {
    library(glmnet)
    ind <- inds[[i]]
    cohort_subset <- lapply(1:num_ngrams_collected, function(x)
                            cohorts[ind])
    report_subset <- reports[[i]]
    new_dist <- ComputeDistributionEM(report_subset,
                                      cohort_subset,
                                      maps, ignore_other = FALSE,
                                      params = params, estimate_var = FALSE)
    new_dist
  }

  # Compute the pairwise distributions (could be parallelized)
  dists <- lapply(seq(num_candidate_ngrams), function(i)
                  .ComputeDist(i, inds, cohorts, reports, maps,
                               params, num_ngrams_collected))
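  # For example, the loop above could be run in parallel with the 'parallel'
  # package (a sketch, assuming 4 cores are available):
  #
  # > dists <- parallel::mclapply(seq(num_candidate_ngrams), function(i)
  # +                             .ComputeDist(i, inds, cohorts, reports, maps,
  # +                                          params, num_ngrams_collected),
  # +                             mc.cores = 4)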

  dists_null <- sapply(dists, is.null)
  if (any(dists_null)) {
    return (list(found_candidates = list(), dists = dists))
  }
  cat("Found the pairwise ngram distributions.\n")

  # Find the threshold for choosing "significant" ngram pairs
  f <- params$f; q <- params$q; p <- params$p
  q2 <- .5 * f * (p + q) + (1 - f) * q
  p2 <- .5 * f * (p + q) + (1 - f) * p
  std_dev_counts <- sqrt(p2 * (1 - p2) * N) / (q2 - p2)
  (threshold <- std_dev_counts / N)
  # NOTE: the analytically derived threshold above is immediately overridden
  # by a fixed value.
  threshold <- 0.04
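  # For example, with hypothetical parameters f = 0.5, p = 0.5, q = 0.75 and
  # N = 1e5, q2 = 0.6875 and p2 = 0.5625, so the derived threshold would be
  # about 0.013 -- smaller than the fixed 0.04 used here.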

  # Filter joints to remove infrequently co-occurring ngrams.
  candidate_strs <- lapply(1:num_candidate_ngrams, function(i) {
    fit <- dists[[i]]$fit
    edges <- which(fit > threshold, arr.ind = TRUE, useNames = FALSE)

    # Recover the list of strings that seem significant
    found_candidates <- sapply(1:ncol(edges), function(x) {
      chunks <- sapply(edges[, x],
                       function(j) dimnames(fit)[[x]][j])
      chunks
    })
    # sapply returns either "character" vector (for n=1) or a matrix.  Convert
    # it to a matrix.  This can be seen as follows:
    #
    # > class(sapply(1:5, function(x) "a"))
    # [1] "character"
    # > class(sapply(1:5, function(x) c("a", "b")))
    # [1] "matrix"
    found_candidates <- rbind(found_candidates)

    # Remove the "others"
    other_rows <- which(found_candidates == "Other", arr.ind = TRUE)[, 1]
    if (length(other_rows) > 0) {
      # drop = FALSE necessary to keep it a matrix
      found_candidates <- found_candidates[-other_rows, , drop = FALSE]
    }

    found_candidates
  })
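  # Each element of candidate_strs is then a matrix with one row per
  # significant ngram pair, e.g. (hypothetical values):
  #
  #      [,1] [,2]
  # [1,] "ab" "cd"
  # [2,] "ab" "ce"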
  # If any position pair produced no candidate ngrams, give up.
  if (any(sapply(candidate_strs, length) == 0)) {
    return (NULL)
  }

  list(candidate_strs = candidate_strs, dists = dists)
}

FindFeasibleStrings <- function(found_candidates, pairings, num_ngrams,
                                ngram_size) {
  # Uses the list of strings found by the pairwise comparisons to build
  #     a list of full feasible strings. This relies on the iterative,
  #     graph-based approach.
  #
  # Args:
  #   found_candidates: list of candidates found by each pairwise decoding
  #   pairings: (num_ngrams choose 2) x 2 matrix listing all the ngram
  #       position pairings, one pair per row.
  #   num_ngrams: The total number of ngrams per word.
  #   ngram_size: Number of characters per ngram
  #
  # Returns:
  #   Sorted vector of full string candidates (an empty list if none survive).

  # Which ngram pairs are adjacent, i.e. of the form (i,i+1)
  adjacent <- sapply(seq(num_ngrams - 1), function(x) {
    c(1 + (x - 1) * ngram_size, x * ngram_size + 1)
  })

  adjacent_pairs <- apply(adjacent, 2, function(x) {
    which(apply(pairings, 1, function(y) identical(y, x)))
  })
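  # For example, with ngram_size = 2 and num_ngrams = 3 (hypothetical values),
  # the columns of 'adjacent' are the character-position pairs (1, 3) and
  # (3, 5), and 'adjacent_pairs' holds their row indices within 'pairings'
  # (e.g. c(1, 3) if pairings lists the rows (1,3), (1,5), (3,5) in order).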

  # The first set of candidates are ngrams found in positions 1 and 2
  active_cands <- found_candidates[[adjacent_pairs[1]]]
  if (class(active_cands) == "list") {
    return (list())
  } else {
    active_cands <- as.data.frame(active_cands)
  }

  # Now check successive ngrams to find consistent combinations
  #     i.e. after ngrams 1-2, check 2-3, 3-4, 4-5, etc.
  for (i in 2:length(adjacent_pairs)) {
    if (nrow(active_cands) == 0) {
      return (list())
    }
    new_cands <- found_candidates[[adjacent_pairs[i]]]
    new_cands <- as.data.frame(new_cands)
    # Builds the set of possible candidates based only on ascending
    #     candidate pairs
    active_cands <- BuildCandidates(active_cands, new_cands)
  }

  if (nrow(active_cands) == 0) {
    return (list())
  }
  # Now refine these candidates using non-adjacent bigrams
  remaining <- (1:(num_ngrams * (num_ngrams - 1) / 2))[-c(1, adjacent_pairs)]
  # For each non-adjacent pair, make sure that all the candidates are
  #     consistent (in this phase, candidates can ONLY be eliminated)

  for (i in remaining) {
    new_cands <- found_candidates[[i]]
    new_cands <- as.data.frame(new_cands)
    # Prune out all candidates that do not agree with new_cands
    active_cands <- PruneCandidates(active_cands, pairings[i, ],
                                    ngram_size,
                                    new_cands = new_cands)
  }
  # Consolidate the string ngrams into a full string representation
  if (length(active_cands) > 0) {
    active_cands <- sort(apply(active_cands, 1,
                               function(x) paste0(x, collapse = "")))
  }
  unname(active_cands)
}

BuildCandidates <- function(active_cands, new_cands) {
  # Takes in a data frame where each row is a valid sequence of ngrams and
  #     checks which of the new_cands ngram pairs are consistent with
  #     the original active_cands ngram sequence.
  #
  # Args:
  #   active_cands: data frame of ngram sequence candidates (1 candidate
  #       sequence per row)
  #   new_cands: An rx2 data frame with a new list of candidate ngram
  #       pairs that might fit in with the previous list of candidates
  #
  # Returns:
  #   Updated active_cands, with another column if valid extensions are
  #       found.
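  #
  # For example (hypothetical ngrams): if active_cands has rows
  #     ("ab", "cd") and ("ab", "ce"), and new_cands has rows
  #     ("cd", "ef") and ("cd", "eg"), the result has rows
  #     ("ab", "cd", "ef") and ("ab", "cd", "eg"); the row ending in "ce"
  #     is dropped because "ce" does not appear in new_cands.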

  # Get the trailing ngrams from the current candidates
  to_check <- as.vector(tail(t(active_cands), n = 1))
  # Check which of the elements in to_check appear among the
  #     new candidate pairs
  present <- sapply(to_check, function(x) any(x == new_cands))
  # Remove the strings that are not represented among the new candidates
  to_check <- to_check[present]
  # Now insert the new candidates where they belong
  active_cands <- active_cands[present, , drop = FALSE]
  active_cands <- cbind(active_cands, col = NA)
  num_cands <- nrow(active_cands)
  new_num_cands <- num_cands  # tracks nrow(active_cands) as rows are appended
  hit_list <- c()
  for (j in 1:num_cands) {
    inds <- which(new_cands[, 1] == to_check[j])
    if (length(inds) == 0) {
      hit_list <- c(hit_list, j)
      next
    }
    # If there are multiple candidates fitting with an ngram, include
    #     each /full/ string as a candidate
    extra <- length(inds) - 1
    if (extra > 0) {
      rep_inds <- c(j, (new_num_cands + 1):(new_num_cands + extra))
      to_paste <- active_cands[j, ]
      # Add the new candidates to the bottom
      for (p in 1:extra) {
        active_cands <- rbind(active_cands, to_paste)
      }
    } else {
      rep_inds <- c(j)
    }
    active_cands[rep_inds, ncol(active_cands)] <-
        as.vector(new_cands[inds, 2])
    new_num_cands <- nrow(active_cands)
  }
  # If there were some false candidates in the original set, remove them
  if (length(hit_list) > 0) {
    active_cands <- active_cands[-hit_list, , drop = FALSE]
  }
  active_cands
}

PruneCandidates <- function(active_cands, pairing, ngram_size, new_cands) {
  # Takes in a data frame where each row is a valid sequence of ngrams and
  #     checks which of the new_cands ngram pairs are consistent with
  #     the original active_cands ngram sequence. This can ONLY remove
  #     candidates presented in active_cands.
  #
  # Args:
  #   active_cands: data frame of ngram sequence candidates (1 candidate
  #       sequence per row)
  #   pairing: A length-2 list storing the 1-based character positions of the
  #       two ngrams being compared
  #   ngram_size: Number of characters per ngram
  #   new_cands: An rx2 data frame with a new list of candidate ngram
  #       pairs that might fit in with the previous list of candidates
  #
  # Returns:
  #   Updated active_cands, with a reduced number of rows.
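  #
  # For example (hypothetical ngrams): with ngram_size = 2 and pairing
  #     c(1, 5) (i.e. ngram positions 1 and 3), a candidate row
  #     ("ab", "cd", "ef") survives only if ("ab", "ef") appears as a row
  #     of new_cands.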

  # Convert the pairing to an ngram index
  cols <- sapply(pairing, function(x) (x - 1) / ngram_size + 1)

  cands_to_check <- active_cands[, cols, drop = FALSE]
  # Find the candidates that are inconsistent with the new data
  # (vapply over seq_len keeps this well-defined when no candidate rows remain)
  hit_list <- vapply(seq_len(nrow(cands_to_check)), function(j) {
    to_kill <- FALSE
    if (nrow(new_cands) == 0) {
      return (TRUE)
    }
    if (!any(apply(new_cands, 1, function(x)
                   all(cands_to_check[j, , drop = FALSE] == x)))) {
      to_kill <- TRUE
    }
    to_kill
  }, logical(1))

  # Determine which rows are false positives
  hit_indices <- which(hit_list)
  # Remove the false positives
  if (length(hit_indices) > 0) {
    active_cands <- active_cands[-hit_indices, ]
  }
  active_cands
}

EstimateDictionary <- function(report_data, N, ngram_params, params) {
  # Takes in a list of report data and returns a list of string
  #     estimates of the dictionary.
  #
  # Args:
  #   report_data: Object containing data relevant to reports:
  #       $inds: The indices of reports collected using various pairs
  #       $cohorts: The cohort of each report
  #       $map: The map used for all the ngrams
  #       $reports: The reports used for each ngram and full string
  #   N: the number of individuals sending reports
  #   ngram_params: Parameters related to ngram length, etc
  #   params: Parameter list with RAPPOR noise levels, cohorts, etc
  #
  # Returns:
  #   List with found_candidates (full string estimates) and
  #       pairwise_candidates (matrices of significant ngram pairs)

  pairwise_candidates <- FindPairwiseCandidates(report_data, N,
                                                ngram_params,
                                                params)$candidate_strs
  cat("Found the pairwise candidates.\n")
  if (is.null(pairwise_candidates)) {
    return (list())
  }
  found_candidates <- FindFeasibleStrings(pairwise_candidates,
                                          report_data$pairings,
                                          ngram_params$num_ngrams,
                                          ngram_params$ngram_size)
  cat("Found all the candidates.\n")
  list(found_candidates = found_candidates,
       pairwise_candidates = pairwise_candidates)
}

WriteKPartiteGraph <- function(conn, pairwise_candidates, pairings, num_ngrams,
                               ngram_size) {
  # Args:
  #  conn: R connection to write to.  Should be opened with mode w+.
  #  pairwise_candidates: list of matrices.  Each matrix represents a subgraph;
  #    it contains the edges between partitions i and j, so there are (k choose
  #    2) matrices.  Each matrix has dimension E x 2, where E is the number of
  #    edges (one edge per row).
  #  pairings: (k choose 2) x 2 matrix of character positions.  Each row
  #    corresponds to a subgraph; it has the 1-based character indices of
  #    partitions i and j.
  #  num_ngrams: the number of partitions (k) in the k-partite graph
  #  ngram_size: number of characters per ngram

  # File Format:
  #
  # num_partitions 3
  # ngram_size 2
  # edge 0.ab 1.cd
  # edge 0.ab 2.ef
  #
  # The first line specifies the number of partitions (k); the second line
  # gives the ngram size.
  # The remaining lines are edges, where each node is <partition>.<ngram>.
  #
  # Partitions are numbered from 0.  The partition of the left node will be
  # less than the partition of the right node.

  # First two lines are metadata
  cat(sprintf('num_partitions %d\n', num_ngrams), file = conn)
  cat(sprintf('ngram_size %d\n', ngram_size), file = conn)

  for (i in seq_along(pairwise_candidates)) {
    # The two character positions for this subgraph.
    # Turn 1-based character positions into 0-based partition numbers,
    # e.g. (3, 5) -> (1, 2)

    pos1 <- pairings[[i, 1]]
    pos2 <- pairings[[i, 2]]
    part1 <- (pos1 - 1) / ngram_size
    part2 <- (pos2 - 1) / ngram_size
    cat(sprintf("Writing partition (%d, %d)\n", part1, part2))

    p <- pairwise_candidates[[i]]
    # each row is an edge
    for (j in 1:nrow(p)) {
      n1 <- p[[j, 1]]
      n2 <- p[[j, 2]]
      line <- sprintf('edge %d.%s %d.%s\n', part1, n1, part2, n2)
      # NOTE: It would be faster to preallocate 'lines', but we would have to
      # make two passes through pairwise_candidates.
      cat(line, file = conn)
    }
  }
}
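
# A minimal usage sketch (hypothetical file path and data; pairwise_candidates
#     and pairings as produced/consumed by the functions above):
#
# > conn <- file("/tmp/graph.txt", "w+")
# > WriteKPartiteGraph(conn, pairwise_candidates, pairings,
# +                    num_ngrams = 3, ngram_size = 2)
# > close(conn)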