xref: /aosp_15_r20/external/rappor/analysis/R/ngrams_simulation.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved.
2*2abb3134SXin Li#
3*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License");
4*2abb3134SXin Li# you may not use this file except in compliance with the License.
5*2abb3134SXin Li# You may obtain a copy of the License at
6*2abb3134SXin Li#
7*2abb3134SXin Li#     http://www.apache.org/licenses/LICENSE-2.0
8*2abb3134SXin Li#
9*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software
10*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS,
11*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*2abb3134SXin Li# See the License for the specific language governing permissions and
13*2abb3134SXin Li# limitations under the License.
14*2abb3134SXin Li
15*2abb3134SXin Li# Authors: [email protected] (Vasyl Pihur) and [email protected] (Giulia Fanti)
16*2abb3134SXin Li#
17*2abb3134SXin Li# Tools used to simulate sending partial ngrams to the server for estimating the
18*2abb3134SXin Li#     dictionary of terms over which we want to learn a distribution. This
19*2abb3134SXin Li#     mostly contains functions that aid in the generation of synthetic data.
20*2abb3134SXin Li
21*2abb3134SXin Lilibrary(RUnit)
22*2abb3134SXin Lilibrary(parallel)
23*2abb3134SXin Li
24*2abb3134SXin Lisource("analysis/R/encode.R")
25*2abb3134SXin Lisource("analysis/R/decode.R")
26*2abb3134SXin Lisource("analysis/R/simulation.R")
27*2abb3134SXin Lisource("analysis/R/association.R")
28*2abb3134SXin Lisource("analysis/R/decode_ngrams.R")
29*2abb3134SXin Li
30*2abb3134SXin Li# The alphabet is the set of all possible characters that will appear in a
31*2abb3134SXin Li#     string. Here we use the English alphabet, but one might want to include
32*2abb3134SXin Li#     numbers or punctuation marks.
33*2abb3134SXin Lialphabet <- letters
34*2abb3134SXin Li
35*2abb3134SXin LiGenerateCandidates <- function(alphabet, ngram_size = 2) {
36*2abb3134SXin Li  # Draws a random string for each individual in the
37*2abb3134SXin Li  #     population from distribution.
38*2abb3134SXin Li  #
39*2abb3134SXin Li  # Args:
40*2abb3134SXin Li  #   N: Number of individuals in the population
41*2abb3134SXin Li  #   num_strs: Number of strings from which to draw strings
42*2abb3134SXin Li  #   str_len: Length of each string
43*2abb3134SXin Li  #
44*2abb3134SXin Li  # Returns:
45*2abb3134SXin Li  #   Vector of strings for each individual in the population
46*2abb3134SXin Li
47*2abb3134SXin Li  cands <- do.call(expand.grid, lapply(seq(ngram_size), function(i) alphabet))
48*2abb3134SXin Li  apply(cands, 1, function(x) paste0(x, collapse = ""))
49*2abb3134SXin Li}
50*2abb3134SXin Li
51*2abb3134SXin LiGenerateString <- function(n) {
52*2abb3134SXin Li  # Generates a string of a given length from the alphabet.
53*2abb3134SXin Li  #
54*2abb3134SXin Li  # Args:
55*2abb3134SXin Li  #   n: Number of characters in the string
56*2abb3134SXin Li  #
57*2abb3134SXin Li  # Returns:
58*2abb3134SXin Li  #   String of length n
59*2abb3134SXin Li  paste0(sample(alphabet, n, replace = TRUE), collapse = "")
60*2abb3134SXin Li}
61*2abb3134SXin Li
62*2abb3134SXin LiGeneratePopulation <- function(N, num_strs, str_len = 10,
63*2abb3134SXin Li                               distribution = 1) {
64*2abb3134SXin Li  # Generates a string for each individual in the population from distribution.
65*2abb3134SXin Li  #
66*2abb3134SXin Li  # Args:
67*2abb3134SXin Li  #   N: Number of individuals in the population
68*2abb3134SXin Li  #   num_strs: Number of strings from which to draw strings
69*2abb3134SXin Li  #   str_len: Length of each string
70*2abb3134SXin Li  #   distribution: which type of distribution to use
71*2abb3134SXin Li  #     1: Zipfian
72*2abb3134SXin Li  #     2: Geometric (exponential)
73*2abb3134SXin Li  #     3: Step function
74*2abb3134SXin Li  #
75*2abb3134SXin Li  # Returns:
76*2abb3134SXin Li  #   Vector of strings for each individual in the population
77*2abb3134SXin Li
78*2abb3134SXin Li  strs <- sapply(1:num_strs, function(i) GenerateString(str_len))
79*2abb3134SXin Li
80*2abb3134SXin Li  if (distribution == 1) {
81*2abb3134SXin Li    # Zipfian-ish distribution
82*2abb3134SXin Li    prob <- (1:num_strs)^20
83*2abb3134SXin Li    prob <- prob / sum(prob) + 0.001
84*2abb3134SXin Li    prob <- prob / sum(prob)
85*2abb3134SXin Li  } else if (distribution == 2) {
86*2abb3134SXin Li    # Geometric distribution (discrete approximation to exponential)
87*2abb3134SXin Li    p <- 0.3
88*2abb3134SXin Li    prob <- p * (1 - p)^(1:num_strs - 1)
89*2abb3134SXin Li    prob <- prob / sum(prob)
90*2abb3134SXin Li  } else {
91*2abb3134SXin Li    # Uniform
92*2abb3134SXin Li    prob <- rep(1 / num_strs, num_strs)
93*2abb3134SXin Li  }
94*2abb3134SXin Li
95*2abb3134SXin Li  sample(strs, N, replace = TRUE, prob = prob)
96*2abb3134SXin Li}
97*2abb3134SXin Li
98*2abb3134SXin LiSelectNGrams <- function(str, num_ngrams, size, max_str_len = 6) {
99*2abb3134SXin Li  # Selects which ngrams each user will encode and then submit.
100*2abb3134SXin Li  #
101*2abb3134SXin Li  # Args:
102*2abb3134SXin Li  #   str: String from which ngram is built.
103*2abb3134SXin Li  #   num_ngrams: Number of ngrams to choose
104*2abb3134SXin Li  #   size: Number of characters per ngram
105*2abb3134SXin Li  #   max_str_len: Maximum number of characters in the string
106*2abb3134SXin Li  #
107*2abb3134SXin Li  # Returns:
108*2abb3134SXin Li  #   List of each individual's ngrams and which positions the ngrams
109*2abb3134SXin Li  #       were drawn from.
110*2abb3134SXin Li
111*2abb3134SXin Li  start <- sort(sample(seq(1, max_str_len, by = size), num_ngrams))
112*2abb3134SXin Li  ngrams <- mapply(function(x, y, str) substr(str, x, y),
113*2abb3134SXin Li                   start, start + size - 1,
114*2abb3134SXin Li                   MoreArgs = list(str = str))
115*2abb3134SXin Li  list(ngrams = ngrams, starts = start)
116*2abb3134SXin Li}
117*2abb3134SXin Li
118*2abb3134SXin LiUpdateMapWithCandidates <- function(str_candidates, sim, params) {
119*2abb3134SXin Li  # Generates a new map based on the returned candidates.
120*2abb3134SXin Li  #     Normally this would be created on the spot by having the
121*2abb3134SXin Li  #     aggregator hash the string candidates. But since we already have
122*2abb3134SXin Li  #     the map from simulation, we'll just choose the appropriate
123*2abb3134SXin Li  #     column
124*2abb3134SXin Li  #
125*2abb3134SXin Li  # Arguments:
126*2abb3134SXin Li  #   str_candidates: Vector of string candidates
127*2abb3134SXin Li  #   sim: Simulation object containing the original map
128*2abb3134SXin Li  #   params: RAPPOR parameter list
129*2abb3134SXin Li
130*2abb3134SXin Li  k <- params$k
131*2abb3134SXin Li  h <- params$h
132*2abb3134SXin Li  m <- params$m
133*2abb3134SXin Li
134*2abb3134SXin Li  # First add the real candidates to the map
135*2abb3134SXin Li  valid_cands <- intersect(str_candidates, colnames(sim$full_map$map_by_cohort[[1]]))
136*2abb3134SXin Li  updated_map <- sim$full_map
137*2abb3134SXin Li  updated_map$map_by_cohort <- lapply(1:m, function(i) {
138*2abb3134SXin Li    sim$full_map$map_by_cohort[[i]][, valid_cands]
139*2abb3134SXin Li  })
140*2abb3134SXin Li
141*2abb3134SXin Li  # Now add the false positives (we can just draw random strings for
142*2abb3134SXin Li  #     these since they didn't appear in the original dataset anyway)
143*2abb3134SXin Li  new_cands <- setdiff(str_candidates, colnames(sim$full_map$map_by_cohort[[1]]))
144*2abb3134SXin Li  M <- length(new_cands)
145*2abb3134SXin Li  if (M > 0) {
146*2abb3134SXin Li    for (i in 1:m) {
147*2abb3134SXin Li      ones <- sample(1:k, M * h, replace = TRUE)
148*2abb3134SXin Li      cols <- rep(1:M, each = h)
149*2abb3134SXin Li      strs <- c(sort(valid_cands), new_cands)
150*2abb3134SXin Li      updated_map$map_by_cohort[[i]] <-
151*2abb3134SXin Li          do.call(cBind, list(updated_map$map_by_cohort[[i]],
152*2abb3134SXin Li                              sparseMatrix(ones, cols, dims = c(k, M))))
153*2abb3134SXin Li      colnames(updated_map$map_by_cohort[[i]]) <- strs
154*2abb3134SXin Li    }
155*2abb3134SXin Li  }
156*2abb3134SXin Li  if (class(updated_map$map_by_cohort[[1]]) == "logical") {
157*2abb3134SXin Li    updated_map$all_cohorts_map <- unlist(updated_map$map_by_cohort)
158*2abb3134SXin Li    updated_map$all_cohorts_map <- Matrix(updated_map$all_cohorts_map, sparse = TRUE)
159*2abb3134SXin Li    colnames(updated_map$all_cohorts_map) <- c(valid_cands, new_cands)
160*2abb3134SXin Li  } else {
161*2abb3134SXin Li    updated_map$all_cohorts_map <- do.call("rBind", updated_map$map_by_cohort)
162*2abb3134SXin Li  }
163*2abb3134SXin Li  updated_map
164*2abb3134SXin Li}
165*2abb3134SXin Li
166*2abb3134SXin LiSimulateNGrams <- function(N, ngram_params, str_len, num_strs = 10,
167*2abb3134SXin Li                           alphabet, params, distribution = 1) {
168*2abb3134SXin Li  # Simulates the creation and encoding of ngrams for each individual.
169*2abb3134SXin Li  #
170*2abb3134SXin Li  # Args:
171*2abb3134SXin Li  #   N: Number of individuals in the population
172*2abb3134SXin Li  #   ngram_params: Parameters about ngram size, etc.
173*2abb3134SXin Li  #   str_len: Length of each string
174*2abb3134SXin Li  #   num_strs: NUmber of strings in the dictionary
175*2abb3134SXin Li  #   alphabet: Alphabet used to generate strings
176*2abb3134SXin Li  #   params: RAPPOR parameters, like noise and cohorts
177*2abb3134SXin Li  #
178*2abb3134SXin Li  # Returns:
179*2abb3134SXin Li  #   List containing all the information needed for estimating and
180*2abb3134SXin Li  #       verifying the results.
181*2abb3134SXin Li
182*2abb3134SXin Li  # Get the list of strings for each user
183*2abb3134SXin Li  strs <- GeneratePopulation(N, num_strs = num_strs,
184*2abb3134SXin Li                             str_len = str_len,
185*2abb3134SXin Li                             distribution)
186*2abb3134SXin Li
187*2abb3134SXin Li  # Split them into ngrams and encode
188*2abb3134SXin Li  ngram <- lapply(strs, function(i)
189*2abb3134SXin Li                  SelectNGrams(i,
190*2abb3134SXin Li                               num_ngrams = ngram_params$num_ngrams_collected,
191*2abb3134SXin Li                               size = ngram_params$ngram_size,
192*2abb3134SXin Li                               max_str_len = str_len))
193*2abb3134SXin Li
194*2abb3134SXin Li  cands <- GenerateCandidates(alphabet, ngram_params$ngram_size)
195*2abb3134SXin Li  map <- CreateMap(cands, params, FALSE)
196*2abb3134SXin Li  cohorts <- sample(1:params$m, N, replace = TRUE)
197*2abb3134SXin Li
198*2abb3134SXin Li  g <- sapply(ngram, function(x) paste(x$starts, sep = "_",
199*2abb3134SXin Li                                       collapse = "_"))
200*2abb3134SXin Li  ug <- sort(unique(g))
201*2abb3134SXin Li  pairings <- t(sapply(ug, function(x)
202*2abb3134SXin Li                       sapply(strsplit(x, "_"), function(y) as.numeric(y))))
203*2abb3134SXin Li
204*2abb3134SXin Li  inds <- lapply(1:length(ug), function(i) ind <- which(g == ug[i]))
205*2abb3134SXin Li
206*2abb3134SXin Li  reports <- lapply(1:length(ug), function(k) {
207*2abb3134SXin Li    # Generate the ngram reports
208*2abb3134SXin Li    lapply(1:ngram_params$num_ngrams_collected, function(x) {
209*2abb3134SXin Li      EncodeAll(sapply(inds[[k]], function(j) ngram[[j]]$ngrams[x]),
210*2abb3134SXin Li                cohorts[inds[[k]]], map$map_by_cohort, params)})
211*2abb3134SXin Li  })
212*2abb3134SXin Li  cat("Encoded the ngrams.\n")
213*2abb3134SXin Li  # Now generate the full string reports
214*2abb3134SXin Li  full_map <- CreateMap(sort(unique(strs)), params, FALSE)
215*2abb3134SXin Li  full_reports <- EncodeAll(strs, cohorts, full_map$map_by_cohort, params)
216*2abb3134SXin Li
217*2abb3134SXin Li  list(reports = reports, cohorts = cohorts, ngram = ngram, map = map,
218*2abb3134SXin Li       strs = strs, pairings = pairings, inds = inds, cands = cands,
219*2abb3134SXin Li       full_reports = full_reports, full_map = full_map)
220*2abb3134SXin Li
221*2abb3134SXin Li}
222*2abb3134SXin Li
223*2abb3134SXin Li
224*2abb3134SXin LiEstimateDictionaryTrial <- function(N, str_len, num_strs,
225*2abb3134SXin Li                                    params, ngram_params,
226*2abb3134SXin Li                                    distribution = 3) {
227*2abb3134SXin Li  # Runs a single trial for simulation. Generates simulated reports,
228*2abb3134SXin Li  #     decodes them, and returns the result.
229*2abb3134SXin Li  #
230*2abb3134SXin Li  # Arguments:
231*2abb3134SXin Li  #   N: Number of users to simulation
232*2abb3134SXin Li  #   str_len: The length of strings to estimate
233*2abb3134SXin Li  #   num_strs: The number of strings in the dictionary
234*2abb3134SXin Li  #   params: RAPPOR parameter list
235*2abb3134SXin Li  #   ngram_params: Parameters related to the size of ngrams
236*2abb3134SXin Li  #   distribution: Tells what kind of distribution to use:
237*2abb3134SXin Li  #       1: Zipfian
238*2abb3134SXin Li  #       2: Geometric
239*2abb3134SXin Li  #       3: Uniform (default)
240*2abb3134SXin Li  #
241*2abb3134SXin Li  # Returns:
242*2abb3134SXin Li  #   List with recovered and true marginals.
243*2abb3134SXin Li
244*2abb3134SXin Li  # We call the needed libraries here in order to make them available when this
245*2abb3134SXin Li  #     function gets called by BorgApply. Otherwise, they do not get included.
246*2abb3134SXin Li  library(glmnet)
247*2abb3134SXin Li  library(parallel)
248*2abb3134SXin Li  sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
249*2abb3134SXin Li                        alphabet, params, distribution)
250*2abb3134SXin Li
251*2abb3134SXin Li  res <- EstimateDictionary(sim, N, ngram_params, params)
252*2abb3134SXin Li  str_candidates <- res$found_candidates
253*2abb3134SXin Li  pairwise_candidates <- res$pairwise_candidates
254*2abb3134SXin Li
255*2abb3134SXin Li  if (length(str_candidates) == 0) {
256*2abb3134SXin Li    return (NULL)
257*2abb3134SXin Li  }
258*2abb3134SXin Li  updated_map <- UpdateMapWithCandidates(str_candidates, sim, params)
259*2abb3134SXin Li
260*2abb3134SXin Li  # Compute the marginal for this new set of strings
261*2abb3134SXin Li  variable_counts <- ComputeCounts(sim$full_reports, sim$cohorts, params)
262*2abb3134SXin Li  # Our dictionary estimate
263*2abb3134SXin Li  marginal <- Decode(variable_counts, updated_map$all_cohorts_map, params)$fit
264*2abb3134SXin Li  # Estimate given full dictionary knowledge
265*2abb3134SXin Li  marginal_full <- Decode(variable_counts, sim$full_map$all_cohorts_map, params)$fit
266*2abb3134SXin Li  # The true (sampled) data distribution
267*2abb3134SXin Li  truth <- sort(table(sim$strs)) / N
268*2abb3134SXin Li
269*2abb3134SXin Li  list(marginal = marginal, marginal_full = marginal_full,
270*2abb3134SXin Li       truth = truth, pairwise_candidates = pairwise_candidates)
271*2abb3134SXin Li}
272