xref: /aosp_15_r20/external/rappor/analysis/R/unknowns_test.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1# Copyright 2014 Google Inc. All rights reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15# Author: [email protected] (Giulia Fanti)
16#
17# Tests the unknown unknowns dictionary estimation functions.
18#     There are two main components involved in estimating this unknown
19#     distribution:
20#          a) Find the pairwise ngrams that co-occur often.
21#          b) Determine which full strings are consisted with all pairwise
22#             relations.
23#
24#     TestEstimateDictionary() tests the full pipeline, including parts (a)
25#         and (b).
26#     TestFindFeasibleStrings() tests only part (b).
27#     Both tests generate their own data.
28
29library(parallel)
30source("analysis/R/encode.R")
31source("analysis/R/decode.R")
32source("analysis/R/simulation.R")
33source("analysis/R/association.R")
34source("analysis/R/decode_ngrams.R")
35source("analysis/R/ngrams_simulation.R")
36alphabet <- letters
37options(warn = -1)
38
39GeneratePopulation <- function(N, num_strs, str_len = 10,
40                               distribution = NULL) {
41  # Generates a /deterministic/ string for each individual in the
42  #     population from distribution.
43  #
44  # Args:
45  #   N: Number of individuals in the population
46  #   num_strs: Number of strings from which to draw strings
47  #   str_len: Length of each string
48  #   distribution: Just here for compatibility with original
49  #       GeneratePopulation function in ngrams_simulation.R
50  #
51  # Returns:
52  #   Vector of strings for each individual in the population
53
54  strs <- sapply(1:num_strs, function(i) {
55    paste0(alphabet[(str_len * (i - 1) + 1):(str_len * i)], collapse = "")
56  })
57
58  # Uniform distribution
59  prob <- rep(1 / num_strs, num_strs)
60  sample(strs, N, replace = TRUE, prob = prob)
61}
62
63TestEstimateDictionary <- function() {
64  # Tests that the algorithm without noise recovers a uniform
65  #     string population correctly.
66
67  # Compute the strings from measuring only 2 ngrams
68  N <- 100
69  str_len <- 6
70  ngram_size <- 2
71  num_ngrams <- str_len / ngram_size
72  num_strs <- 1
73
74  params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0)
75
76  ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams,
77                       num_ngrams_collected = 2)
78
79  sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
80                        alphabet, params, distribution = 3)
81
82  res <- EstimateDictionary(sim, N, ngram_params, params)
83
84  # Check that the correct strings are found
85  if (num_strs == 1) {
86    checkTrue(res$found_candidates == sort(unique(sim$strs)))
87  } else {
88    checkTrue(all.equal(res$found_candidates, sort(unique(sim$strs))))
89  }
90}
91
92TestFindFeasibleStrings <- function() {
93  # Tests that FindPairwiseCandidates weeds out false positives.
94  #     We test this by adding false positives to the pairwise estimates.
95  N <- 100
96  str_len <- 6
97  ngram_size <- 2
98  num_ngrams <- str_len / ngram_size
99  num_strs <- 2
100
101  params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0)
102
103  ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams,
104                       num_ngrams_collected = 2)
105
106  sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
107                        alphabet, params)
108
109  pairwise_candidates <- FindPairwiseCandidates(sim, N, ngram_params,
110                                                params)$candidate_strs
111  cat("Found the pairwise candidates. \n")
112
113  pairwise_candidates[[1]] <- rbind(pairwise_candidates[[1]], c("ab", "le"))
114
115  if (is.null(pairwise_candidates)) {
116    return (FALSE)
117  }
118
119  conn <- file('graph.txt', 'w+')
120  WriteKPartiteGraph(conn,
121                     pairwise_candidates,
122                     sim$pairings,
123                     ngram_params$num_ngrams,
124                     ngram_params$ngram_size)
125
126  close(conn)
127  cat("Wrote graph.txt\n")
128
129  found_candidates <- FindFeasibleStrings(pairwise_candidates,
130                                          sim$pairings,
131                                          ngram_params$num_ngrams,
132                                          ngram_params$ngram_size)
133  # Check that the correct strings are found
134  if (num_strs == 1) {
135    checkTrue(found_candidates == sort(unique(sim$strs)))
136  } else {
137    checkTrue(all.equal(found_candidates, sort(unique(sim$strs))))
138  }
139}
140