xref: /aosp_15_r20/external/rappor/analysis/R/unknowns_test.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved.
2*2abb3134SXin Li#
3*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License");
4*2abb3134SXin Li# you may not use this file except in compliance with the License.
5*2abb3134SXin Li# You may obtain a copy of the License at
6*2abb3134SXin Li#
7*2abb3134SXin Li#     http://www.apache.org/licenses/LICENSE-2.0
8*2abb3134SXin Li#
9*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software
10*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS,
11*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*2abb3134SXin Li# See the License for the specific language governing permissions and
13*2abb3134SXin Li# limitations under the License.
14*2abb3134SXin Li
15*2abb3134SXin Li# Author: [email protected] (Giulia Fanti)
16*2abb3134SXin Li#
17*2abb3134SXin Li# Tests the unknown unknowns dictionary estimation functions.
18*2abb3134SXin Li#     There are two main components involved in estimating this unknown
19*2abb3134SXin Li#     distribution:
20*2abb3134SXin Li#          a) Find the pairwise ngrams that co-occur often.
21*2abb3134SXin Li#          b) Determine which full strings are consisted with all pairwise
22*2abb3134SXin Li#             relations.
23*2abb3134SXin Li#
24*2abb3134SXin Li#     TestEstimateDictionary() tests the full pipeline, including parts (a)
25*2abb3134SXin Li#         and (b).
26*2abb3134SXin Li#     TestFindFeasibleStrings() tests only part (b).
27*2abb3134SXin Li#     Both tests generate their own data.
28*2abb3134SXin Li
29*2abb3134SXin Lilibrary(parallel)
30*2abb3134SXin Lisource("analysis/R/encode.R")
31*2abb3134SXin Lisource("analysis/R/decode.R")
32*2abb3134SXin Lisource("analysis/R/simulation.R")
33*2abb3134SXin Lisource("analysis/R/association.R")
34*2abb3134SXin Lisource("analysis/R/decode_ngrams.R")
35*2abb3134SXin Lisource("analysis/R/ngrams_simulation.R")
36*2abb3134SXin Lialphabet <- letters
37*2abb3134SXin Lioptions(warn = -1)
38*2abb3134SXin Li
39*2abb3134SXin LiGeneratePopulation <- function(N, num_strs, str_len = 10,
40*2abb3134SXin Li                               distribution = NULL) {
41*2abb3134SXin Li  # Generates a /deterministic/ string for each individual in the
42*2abb3134SXin Li  #     population from distribution.
43*2abb3134SXin Li  #
44*2abb3134SXin Li  # Args:
45*2abb3134SXin Li  #   N: Number of individuals in the population
46*2abb3134SXin Li  #   num_strs: Number of strings from which to draw strings
47*2abb3134SXin Li  #   str_len: Length of each string
48*2abb3134SXin Li  #   distribution: Just here for compatibility with original
49*2abb3134SXin Li  #       GeneratePopulation function in ngrams_simulation.R
50*2abb3134SXin Li  #
51*2abb3134SXin Li  # Returns:
52*2abb3134SXin Li  #   Vector of strings for each individual in the population
53*2abb3134SXin Li
54*2abb3134SXin Li  strs <- sapply(1:num_strs, function(i) {
55*2abb3134SXin Li    paste0(alphabet[(str_len * (i - 1) + 1):(str_len * i)], collapse = "")
56*2abb3134SXin Li  })
57*2abb3134SXin Li
58*2abb3134SXin Li  # Uniform distribution
59*2abb3134SXin Li  prob <- rep(1 / num_strs, num_strs)
60*2abb3134SXin Li  sample(strs, N, replace = TRUE, prob = prob)
61*2abb3134SXin Li}
62*2abb3134SXin Li
63*2abb3134SXin LiTestEstimateDictionary <- function() {
64*2abb3134SXin Li  # Tests that the algorithm without noise recovers a uniform
65*2abb3134SXin Li  #     string population correctly.
66*2abb3134SXin Li
67*2abb3134SXin Li  # Compute the strings from measuring only 2 ngrams
68*2abb3134SXin Li  N <- 100
69*2abb3134SXin Li  str_len <- 6
70*2abb3134SXin Li  ngram_size <- 2
71*2abb3134SXin Li  num_ngrams <- str_len / ngram_size
72*2abb3134SXin Li  num_strs <- 1
73*2abb3134SXin Li
74*2abb3134SXin Li  params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0)
75*2abb3134SXin Li
76*2abb3134SXin Li  ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams,
77*2abb3134SXin Li                       num_ngrams_collected = 2)
78*2abb3134SXin Li
79*2abb3134SXin Li  sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
80*2abb3134SXin Li                        alphabet, params, distribution = 3)
81*2abb3134SXin Li
82*2abb3134SXin Li  res <- EstimateDictionary(sim, N, ngram_params, params)
83*2abb3134SXin Li
84*2abb3134SXin Li  # Check that the correct strings are found
85*2abb3134SXin Li  if (num_strs == 1) {
86*2abb3134SXin Li    checkTrue(res$found_candidates == sort(unique(sim$strs)))
87*2abb3134SXin Li  } else {
88*2abb3134SXin Li    checkTrue(all.equal(res$found_candidates, sort(unique(sim$strs))))
89*2abb3134SXin Li  }
90*2abb3134SXin Li}
91*2abb3134SXin Li
92*2abb3134SXin LiTestFindFeasibleStrings <- function() {
93*2abb3134SXin Li  # Tests that FindPairwiseCandidates weeds out false positives.
94*2abb3134SXin Li  #     We test this by adding false positives to the pairwise estimates.
95*2abb3134SXin Li  N <- 100
96*2abb3134SXin Li  str_len <- 6
97*2abb3134SXin Li  ngram_size <- 2
98*2abb3134SXin Li  num_ngrams <- str_len / ngram_size
99*2abb3134SXin Li  num_strs <- 2
100*2abb3134SXin Li
101*2abb3134SXin Li  params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0)
102*2abb3134SXin Li
103*2abb3134SXin Li  ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams,
104*2abb3134SXin Li                       num_ngrams_collected = 2)
105*2abb3134SXin Li
106*2abb3134SXin Li  sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
107*2abb3134SXin Li                        alphabet, params)
108*2abb3134SXin Li
109*2abb3134SXin Li  pairwise_candidates <- FindPairwiseCandidates(sim, N, ngram_params,
110*2abb3134SXin Li                                                params)$candidate_strs
111*2abb3134SXin Li  cat("Found the pairwise candidates. \n")
112*2abb3134SXin Li
113*2abb3134SXin Li  pairwise_candidates[[1]] <- rbind(pairwise_candidates[[1]], c("ab", "le"))
114*2abb3134SXin Li
115*2abb3134SXin Li  if (is.null(pairwise_candidates)) {
116*2abb3134SXin Li    return (FALSE)
117*2abb3134SXin Li  }
118*2abb3134SXin Li
119*2abb3134SXin Li  conn <- file('graph.txt', 'w+')
120*2abb3134SXin Li  WriteKPartiteGraph(conn,
121*2abb3134SXin Li                     pairwise_candidates,
122*2abb3134SXin Li                     sim$pairings,
123*2abb3134SXin Li                     ngram_params$num_ngrams,
124*2abb3134SXin Li                     ngram_params$ngram_size)
125*2abb3134SXin Li
126*2abb3134SXin Li  close(conn)
127*2abb3134SXin Li  cat("Wrote graph.txt\n")
128*2abb3134SXin Li
129*2abb3134SXin Li  found_candidates <- FindFeasibleStrings(pairwise_candidates,
130*2abb3134SXin Li                                          sim$pairings,
131*2abb3134SXin Li                                          ngram_params$num_ngrams,
132*2abb3134SXin Li                                          ngram_params$ngram_size)
133*2abb3134SXin Li  # Check that the correct strings are found
134*2abb3134SXin Li  if (num_strs == 1) {
135*2abb3134SXin Li    checkTrue(found_candidates == sort(unique(sim$strs)))
136*2abb3134SXin Li  } else {
137*2abb3134SXin Li    checkTrue(all.equal(found_candidates, sort(unique(sim$strs))))
138*2abb3134SXin Li  }
139*2abb3134SXin Li}
140