1*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved. 2*2abb3134SXin Li# 3*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License"); 4*2abb3134SXin Li# you may not use this file except in compliance with the License. 5*2abb3134SXin Li# You may obtain a copy of the License at 6*2abb3134SXin Li# 7*2abb3134SXin Li# http://www.apache.org/licenses/LICENSE-2.0 8*2abb3134SXin Li# 9*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software 10*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS, 11*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*2abb3134SXin Li# See the License for the specific language governing permissions and 13*2abb3134SXin Li# limitations under the License. 14*2abb3134SXin Li 15*2abb3134SXin Li# Authors: [email protected] (Vasyl Pihur) and [email protected] (Giulia Fanti) 16*2abb3134SXin Li# 17*2abb3134SXin Li# Tools used to simulate sending partial ngrams to the server for estimating the 18*2abb3134SXin Li# dictionary of terms over which we want to learn a distribution. This 19*2abb3134SXin Li# mostly contains functions that aid in the generation of synthetic data. 20*2abb3134SXin Li 21*2abb3134SXin Lilibrary(RUnit) 22*2abb3134SXin Lilibrary(parallel) 23*2abb3134SXin Li 24*2abb3134SXin Lisource("analysis/R/encode.R") 25*2abb3134SXin Lisource("analysis/R/decode.R") 26*2abb3134SXin Lisource("analysis/R/simulation.R") 27*2abb3134SXin Lisource("analysis/R/association.R") 28*2abb3134SXin Lisource("analysis/R/decode_ngrams.R") 29*2abb3134SXin Li 30*2abb3134SXin Li# The alphabet is the set of all possible characters that will appear in a 31*2abb3134SXin Li# string. Here we use the English alphabet, but one might want to include 32*2abb3134SXin Li# numbers or punctuation marks. 33*2abb3134SXin Lialphabet <- letters 34*2abb3134SXin Li 35*2abb3134SXin LiGenerateCandidates <- function(alphabet, ngram_size = 2) { 36*2abb3134SXin Li # Draws a random string for each individual in the 37*2abb3134SXin Li # population from distribution. 38*2abb3134SXin Li # 39*2abb3134SXin Li # Args: 40*2abb3134SXin Li # N: Number of individuals in the population 41*2abb3134SXin Li # num_strs: Number of strings from which to draw strings 42*2abb3134SXin Li # str_len: Length of each string 43*2abb3134SXin Li # 44*2abb3134SXin Li # Returns: 45*2abb3134SXin Li # Vector of strings for each individual in the population 46*2abb3134SXin Li 47*2abb3134SXin Li cands <- do.call(expand.grid, lapply(seq(ngram_size), function(i) alphabet)) 48*2abb3134SXin Li apply(cands, 1, function(x) paste0(x, collapse = "")) 49*2abb3134SXin Li} 50*2abb3134SXin Li 51*2abb3134SXin LiGenerateString <- function(n) { 52*2abb3134SXin Li # Generates a string of a given length from the alphabet. 53*2abb3134SXin Li # 54*2abb3134SXin Li # Args: 55*2abb3134SXin Li # n: Number of characters in the string 56*2abb3134SXin Li # 57*2abb3134SXin Li # Returns: 58*2abb3134SXin Li # String of length n 59*2abb3134SXin Li paste0(sample(alphabet, n, replace = TRUE), collapse = "") 60*2abb3134SXin Li} 61*2abb3134SXin Li 62*2abb3134SXin LiGeneratePopulation <- function(N, num_strs, str_len = 10, 63*2abb3134SXin Li distribution = 1) { 64*2abb3134SXin Li # Generates a string for each individual in the population from distribution. 65*2abb3134SXin Li # 66*2abb3134SXin Li # Args: 67*2abb3134SXin Li # N: Number of individuals in the population 68*2abb3134SXin Li # num_strs: Number of strings from which to draw strings 69*2abb3134SXin Li # str_len: Length of each string 70*2abb3134SXin Li # distribution: which type of distribution to use 71*2abb3134SXin Li # 1: Zipfian 72*2abb3134SXin Li # 2: Geometric (exponential) 73*2abb3134SXin Li # 3: Step function 74*2abb3134SXin Li # 75*2abb3134SXin Li # Returns: 76*2abb3134SXin Li # Vector of strings for each individual in the population 77*2abb3134SXin Li 78*2abb3134SXin Li strs <- sapply(1:num_strs, function(i) GenerateString(str_len)) 79*2abb3134SXin Li 80*2abb3134SXin Li if (distribution == 1) { 81*2abb3134SXin Li # Zipfian-ish distribution 82*2abb3134SXin Li prob <- (1:num_strs)^20 83*2abb3134SXin Li prob <- prob / sum(prob) + 0.001 84*2abb3134SXin Li prob <- prob / sum(prob) 85*2abb3134SXin Li } else if (distribution == 2) { 86*2abb3134SXin Li # Geometric distribution (discrete approximation to exponential) 87*2abb3134SXin Li p <- 0.3 88*2abb3134SXin Li prob <- p * (1 - p)^(1:num_strs - 1) 89*2abb3134SXin Li prob <- prob / sum(prob) 90*2abb3134SXin Li } else { 91*2abb3134SXin Li # Uniform 92*2abb3134SXin Li prob <- rep(1 / num_strs, num_strs) 93*2abb3134SXin Li } 94*2abb3134SXin Li 95*2abb3134SXin Li sample(strs, N, replace = TRUE, prob = prob) 96*2abb3134SXin Li} 97*2abb3134SXin Li 98*2abb3134SXin LiSelectNGrams <- function(str, num_ngrams, size, max_str_len = 6) { 99*2abb3134SXin Li # Selects which ngrams each user will encode and then submit. 100*2abb3134SXin Li # 101*2abb3134SXin Li # Args: 102*2abb3134SXin Li # str: String from which ngram is built. 103*2abb3134SXin Li # num_ngrams: Number of ngrams to choose 104*2abb3134SXin Li # size: Number of characters per ngram 105*2abb3134SXin Li # max_str_len: Maximum number of characters in the string 106*2abb3134SXin Li # 107*2abb3134SXin Li # Returns: 108*2abb3134SXin Li # List of each individual's ngrams and which positions the ngrams 109*2abb3134SXin Li # were drawn from. 110*2abb3134SXin Li 111*2abb3134SXin Li start <- sort(sample(seq(1, max_str_len, by = size), num_ngrams)) 112*2abb3134SXin Li ngrams <- mapply(function(x, y, str) substr(str, x, y), 113*2abb3134SXin Li start, start + size - 1, 114*2abb3134SXin Li MoreArgs = list(str = str)) 115*2abb3134SXin Li list(ngrams = ngrams, starts = start) 116*2abb3134SXin Li} 117*2abb3134SXin Li 118*2abb3134SXin LiUpdateMapWithCandidates <- function(str_candidates, sim, params) { 119*2abb3134SXin Li # Generates a new map based on the returned candidates. 120*2abb3134SXin Li # Normally this would be created on the spot by having the 121*2abb3134SXin Li # aggregator hash the string candidates. But since we already have 122*2abb3134SXin Li # the map from simulation, we'll just choose the appropriate 123*2abb3134SXin Li # column 124*2abb3134SXin Li # 125*2abb3134SXin Li # Arguments: 126*2abb3134SXin Li # str_candidates: Vector of string candidates 127*2abb3134SXin Li # sim: Simulation object containing the original map 128*2abb3134SXin Li # params: RAPPOR parameter list 129*2abb3134SXin Li 130*2abb3134SXin Li k <- params$k 131*2abb3134SXin Li h <- params$h 132*2abb3134SXin Li m <- params$m 133*2abb3134SXin Li 134*2abb3134SXin Li # First add the real candidates to the map 135*2abb3134SXin Li valid_cands <- intersect(str_candidates, colnames(sim$full_map$map_by_cohort[[1]])) 136*2abb3134SXin Li updated_map <- sim$full_map 137*2abb3134SXin Li updated_map$map_by_cohort <- lapply(1:m, function(i) { 138*2abb3134SXin Li sim$full_map$map_by_cohort[[i]][, valid_cands] 139*2abb3134SXin Li }) 140*2abb3134SXin Li 141*2abb3134SXin Li # Now add the false positives (we can just draw random strings for 142*2abb3134SXin Li # these since they didn't appear in the original dataset anyway) 143*2abb3134SXin Li new_cands <- setdiff(str_candidates, colnames(sim$full_map$map_by_cohort[[1]])) 144*2abb3134SXin Li M <- length(new_cands) 145*2abb3134SXin Li if (M > 0) { 146*2abb3134SXin Li for (i in 1:m) { 147*2abb3134SXin Li ones <- sample(1:k, M * h, replace = TRUE) 148*2abb3134SXin Li cols <- rep(1:M, each = h) 149*2abb3134SXin Li strs <- c(sort(valid_cands), new_cands) 150*2abb3134SXin Li updated_map$map_by_cohort[[i]] <- 151*2abb3134SXin Li do.call(cBind, list(updated_map$map_by_cohort[[i]], 152*2abb3134SXin Li sparseMatrix(ones, cols, dims = c(k, M)))) 153*2abb3134SXin Li colnames(updated_map$map_by_cohort[[i]]) <- strs 154*2abb3134SXin Li } 155*2abb3134SXin Li } 156*2abb3134SXin Li if (class(updated_map$map_by_cohort[[1]]) == "logical") { 157*2abb3134SXin Li updated_map$all_cohorts_map <- unlist(updated_map$map_by_cohort) 158*2abb3134SXin Li updated_map$all_cohorts_map <- Matrix(updated_map$all_cohorts_map, sparse = TRUE) 159*2abb3134SXin Li colnames(updated_map$all_cohorts_map) <- c(valid_cands, new_cands) 160*2abb3134SXin Li } else { 161*2abb3134SXin Li updated_map$all_cohorts_map <- do.call("rBind", updated_map$map_by_cohort) 162*2abb3134SXin Li } 163*2abb3134SXin Li updated_map 164*2abb3134SXin Li} 165*2abb3134SXin Li 166*2abb3134SXin LiSimulateNGrams <- function(N, ngram_params, str_len, num_strs = 10, 167*2abb3134SXin Li alphabet, params, distribution = 1) { 168*2abb3134SXin Li # Simulates the creation and encoding of ngrams for each individual. 169*2abb3134SXin Li # 170*2abb3134SXin Li # Args: 171*2abb3134SXin Li # N: Number of individuals in the population 172*2abb3134SXin Li # ngram_params: Parameters about ngram size, etc. 173*2abb3134SXin Li # str_len: Length of each string 174*2abb3134SXin Li # num_strs: NUmber of strings in the dictionary 175*2abb3134SXin Li # alphabet: Alphabet used to generate strings 176*2abb3134SXin Li # params: RAPPOR parameters, like noise and cohorts 177*2abb3134SXin Li # 178*2abb3134SXin Li # Returns: 179*2abb3134SXin Li # List containing all the information needed for estimating and 180*2abb3134SXin Li # verifying the results. 181*2abb3134SXin Li 182*2abb3134SXin Li # Get the list of strings for each user 183*2abb3134SXin Li strs <- GeneratePopulation(N, num_strs = num_strs, 184*2abb3134SXin Li str_len = str_len, 185*2abb3134SXin Li distribution) 186*2abb3134SXin Li 187*2abb3134SXin Li # Split them into ngrams and encode 188*2abb3134SXin Li ngram <- lapply(strs, function(i) 189*2abb3134SXin Li SelectNGrams(i, 190*2abb3134SXin Li num_ngrams = ngram_params$num_ngrams_collected, 191*2abb3134SXin Li size = ngram_params$ngram_size, 192*2abb3134SXin Li max_str_len = str_len)) 193*2abb3134SXin Li 194*2abb3134SXin Li cands <- GenerateCandidates(alphabet, ngram_params$ngram_size) 195*2abb3134SXin Li map <- CreateMap(cands, params, FALSE) 196*2abb3134SXin Li cohorts <- sample(1:params$m, N, replace = TRUE) 197*2abb3134SXin Li 198*2abb3134SXin Li g <- sapply(ngram, function(x) paste(x$starts, sep = "_", 199*2abb3134SXin Li collapse = "_")) 200*2abb3134SXin Li ug <- sort(unique(g)) 201*2abb3134SXin Li pairings <- t(sapply(ug, function(x) 202*2abb3134SXin Li sapply(strsplit(x, "_"), function(y) as.numeric(y)))) 203*2abb3134SXin Li 204*2abb3134SXin Li inds <- lapply(1:length(ug), function(i) ind <- which(g == ug[i])) 205*2abb3134SXin Li 206*2abb3134SXin Li reports <- lapply(1:length(ug), function(k) { 207*2abb3134SXin Li # Generate the ngram reports 208*2abb3134SXin Li lapply(1:ngram_params$num_ngrams_collected, function(x) { 209*2abb3134SXin Li EncodeAll(sapply(inds[[k]], function(j) ngram[[j]]$ngrams[x]), 210*2abb3134SXin Li cohorts[inds[[k]]], map$map_by_cohort, params)}) 211*2abb3134SXin Li }) 212*2abb3134SXin Li cat("Encoded the ngrams.\n") 213*2abb3134SXin Li # Now generate the full string reports 214*2abb3134SXin Li full_map <- CreateMap(sort(unique(strs)), params, FALSE) 215*2abb3134SXin Li full_reports <- EncodeAll(strs, cohorts, full_map$map_by_cohort, params) 216*2abb3134SXin Li 217*2abb3134SXin Li list(reports = reports, cohorts = cohorts, ngram = ngram, map = map, 218*2abb3134SXin Li strs = strs, pairings = pairings, inds = inds, cands = cands, 219*2abb3134SXin Li full_reports = full_reports, full_map = full_map) 220*2abb3134SXin Li 221*2abb3134SXin Li} 222*2abb3134SXin Li 223*2abb3134SXin Li 224*2abb3134SXin LiEstimateDictionaryTrial <- function(N, str_len, num_strs, 225*2abb3134SXin Li params, ngram_params, 226*2abb3134SXin Li distribution = 3) { 227*2abb3134SXin Li # Runs a single trial for simulation. Generates simulated reports, 228*2abb3134SXin Li # decodes them, and returns the result. 229*2abb3134SXin Li # 230*2abb3134SXin Li # Arguments: 231*2abb3134SXin Li # N: Number of users to simulation 232*2abb3134SXin Li # str_len: The length of strings to estimate 233*2abb3134SXin Li # num_strs: The number of strings in the dictionary 234*2abb3134SXin Li # params: RAPPOR parameter list 235*2abb3134SXin Li # ngram_params: Parameters related to the size of ngrams 236*2abb3134SXin Li # distribution: Tells what kind of distribution to use: 237*2abb3134SXin Li # 1: Zipfian 238*2abb3134SXin Li # 2: Geometric 239*2abb3134SXin Li # 3: Uniform (default) 240*2abb3134SXin Li # 241*2abb3134SXin Li # Returns: 242*2abb3134SXin Li # List with recovered and true marginals. 243*2abb3134SXin Li 244*2abb3134SXin Li # We call the needed libraries here in order to make them available when this 245*2abb3134SXin Li # function gets called by BorgApply. Otherwise, they do not get included. 246*2abb3134SXin Li library(glmnet) 247*2abb3134SXin Li library(parallel) 248*2abb3134SXin Li sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs, 249*2abb3134SXin Li alphabet, params, distribution) 250*2abb3134SXin Li 251*2abb3134SXin Li res <- EstimateDictionary(sim, N, ngram_params, params) 252*2abb3134SXin Li str_candidates <- res$found_candidates 253*2abb3134SXin Li pairwise_candidates <- res$pairwise_candidates 254*2abb3134SXin Li 255*2abb3134SXin Li if (length(str_candidates) == 0) { 256*2abb3134SXin Li return (NULL) 257*2abb3134SXin Li } 258*2abb3134SXin Li updated_map <- UpdateMapWithCandidates(str_candidates, sim, params) 259*2abb3134SXin Li 260*2abb3134SXin Li # Compute the marginal for this new set of strings 261*2abb3134SXin Li variable_counts <- ComputeCounts(sim$full_reports, sim$cohorts, params) 262*2abb3134SXin Li # Our dictionary estimate 263*2abb3134SXin Li marginal <- Decode(variable_counts, updated_map$all_cohorts_map, params)$fit 264*2abb3134SXin Li # Estimate given full dictionary knowledge 265*2abb3134SXin Li marginal_full <- Decode(variable_counts, sim$full_map$all_cohorts_map, params)$fit 266*2abb3134SXin Li # The true (sampled) data distribution 267*2abb3134SXin Li truth <- sort(table(sim$strs)) / N 268*2abb3134SXin Li 269*2abb3134SXin Li list(marginal = marginal, marginal_full = marginal_full, 270*2abb3134SXin Li truth = truth, pairwise_candidates = pairwise_candidates) 271*2abb3134SXin Li} 272