1# Copyright 2014 Google Inc. All rights reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14 15# Authors: [email protected] (Vasyl Pihur) and [email protected] (Giulia Fanti) 16# 17# Tools used to simulate sending partial ngrams to the server for estimating the 18# dictionary of terms over which we want to learn a distribution. This 19# mostly contains functions that aid in the generation of synthetic data. 20 21library(RUnit) 22library(parallel) 23 24source("analysis/R/encode.R") 25source("analysis/R/decode.R") 26source("analysis/R/simulation.R") 27source("analysis/R/association.R") 28source("analysis/R/decode_ngrams.R") 29 30# The alphabet is the set of all possible characters that will appear in a 31# string. Here we use the English alphabet, but one might want to include 32# numbers or punctuation marks. 33alphabet <- letters 34 35GenerateCandidates <- function(alphabet, ngram_size = 2) { 36 # Draws a random string for each individual in the 37 # population from distribution. 38 # 39 # Args: 40 # N: Number of individuals in the population 41 # num_strs: Number of strings from which to draw strings 42 # str_len: Length of each string 43 # 44 # Returns: 45 # Vector of strings for each individual in the population 46 47 cands <- do.call(expand.grid, lapply(seq(ngram_size), function(i) alphabet)) 48 apply(cands, 1, function(x) paste0(x, collapse = "")) 49} 50 51GenerateString <- function(n) { 52 # Generates a string of a given length from the alphabet. 53 # 54 # Args: 55 # n: Number of characters in the string 56 # 57 # Returns: 58 # String of length n 59 paste0(sample(alphabet, n, replace = TRUE), collapse = "") 60} 61 62GeneratePopulation <- function(N, num_strs, str_len = 10, 63 distribution = 1) { 64 # Generates a string for each individual in the population from distribution. 65 # 66 # Args: 67 # N: Number of individuals in the population 68 # num_strs: Number of strings from which to draw strings 69 # str_len: Length of each string 70 # distribution: which type of distribution to use 71 # 1: Zipfian 72 # 2: Geometric (exponential) 73 # 3: Step function 74 # 75 # Returns: 76 # Vector of strings for each individual in the population 77 78 strs <- sapply(1:num_strs, function(i) GenerateString(str_len)) 79 80 if (distribution == 1) { 81 # Zipfian-ish distribution 82 prob <- (1:num_strs)^20 83 prob <- prob / sum(prob) + 0.001 84 prob <- prob / sum(prob) 85 } else if (distribution == 2) { 86 # Geometric distribution (discrete approximation to exponential) 87 p <- 0.3 88 prob <- p * (1 - p)^(1:num_strs - 1) 89 prob <- prob / sum(prob) 90 } else { 91 # Uniform 92 prob <- rep(1 / num_strs, num_strs) 93 } 94 95 sample(strs, N, replace = TRUE, prob = prob) 96} 97 98SelectNGrams <- function(str, num_ngrams, size, max_str_len = 6) { 99 # Selects which ngrams each user will encode and then submit. 100 # 101 # Args: 102 # str: String from which ngram is built. 103 # num_ngrams: Number of ngrams to choose 104 # size: Number of characters per ngram 105 # max_str_len: Maximum number of characters in the string 106 # 107 # Returns: 108 # List of each individual's ngrams and which positions the ngrams 109 # were drawn from. 110 111 start <- sort(sample(seq(1, max_str_len, by = size), num_ngrams)) 112 ngrams <- mapply(function(x, y, str) substr(str, x, y), 113 start, start + size - 1, 114 MoreArgs = list(str = str)) 115 list(ngrams = ngrams, starts = start) 116} 117 118UpdateMapWithCandidates <- function(str_candidates, sim, params) { 119 # Generates a new map based on the returned candidates. 120 # Normally this would be created on the spot by having the 121 # aggregator hash the string candidates. But since we already have 122 # the map from simulation, we'll just choose the appropriate 123 # column 124 # 125 # Arguments: 126 # str_candidates: Vector of string candidates 127 # sim: Simulation object containing the original map 128 # params: RAPPOR parameter list 129 130 k <- params$k 131 h <- params$h 132 m <- params$m 133 134 # First add the real candidates to the map 135 valid_cands <- intersect(str_candidates, colnames(sim$full_map$map_by_cohort[[1]])) 136 updated_map <- sim$full_map 137 updated_map$map_by_cohort <- lapply(1:m, function(i) { 138 sim$full_map$map_by_cohort[[i]][, valid_cands] 139 }) 140 141 # Now add the false positives (we can just draw random strings for 142 # these since they didn't appear in the original dataset anyway) 143 new_cands <- setdiff(str_candidates, colnames(sim$full_map$map_by_cohort[[1]])) 144 M <- length(new_cands) 145 if (M > 0) { 146 for (i in 1:m) { 147 ones <- sample(1:k, M * h, replace = TRUE) 148 cols <- rep(1:M, each = h) 149 strs <- c(sort(valid_cands), new_cands) 150 updated_map$map_by_cohort[[i]] <- 151 do.call(cBind, list(updated_map$map_by_cohort[[i]], 152 sparseMatrix(ones, cols, dims = c(k, M)))) 153 colnames(updated_map$map_by_cohort[[i]]) <- strs 154 } 155 } 156 if (class(updated_map$map_by_cohort[[1]]) == "logical") { 157 updated_map$all_cohorts_map <- unlist(updated_map$map_by_cohort) 158 updated_map$all_cohorts_map <- Matrix(updated_map$all_cohorts_map, sparse = TRUE) 159 colnames(updated_map$all_cohorts_map) <- c(valid_cands, new_cands) 160 } else { 161 updated_map$all_cohorts_map <- do.call("rBind", updated_map$map_by_cohort) 162 } 163 updated_map 164} 165 166SimulateNGrams <- function(N, ngram_params, str_len, num_strs = 10, 167 alphabet, params, distribution = 1) { 168 # Simulates the creation and encoding of ngrams for each individual. 169 # 170 # Args: 171 # N: Number of individuals in the population 172 # ngram_params: Parameters about ngram size, etc. 173 # str_len: Length of each string 174 # num_strs: NUmber of strings in the dictionary 175 # alphabet: Alphabet used to generate strings 176 # params: RAPPOR parameters, like noise and cohorts 177 # 178 # Returns: 179 # List containing all the information needed for estimating and 180 # verifying the results. 181 182 # Get the list of strings for each user 183 strs <- GeneratePopulation(N, num_strs = num_strs, 184 str_len = str_len, 185 distribution) 186 187 # Split them into ngrams and encode 188 ngram <- lapply(strs, function(i) 189 SelectNGrams(i, 190 num_ngrams = ngram_params$num_ngrams_collected, 191 size = ngram_params$ngram_size, 192 max_str_len = str_len)) 193 194 cands <- GenerateCandidates(alphabet, ngram_params$ngram_size) 195 map <- CreateMap(cands, params, FALSE) 196 cohorts <- sample(1:params$m, N, replace = TRUE) 197 198 g <- sapply(ngram, function(x) paste(x$starts, sep = "_", 199 collapse = "_")) 200 ug <- sort(unique(g)) 201 pairings <- t(sapply(ug, function(x) 202 sapply(strsplit(x, "_"), function(y) as.numeric(y)))) 203 204 inds <- lapply(1:length(ug), function(i) ind <- which(g == ug[i])) 205 206 reports <- lapply(1:length(ug), function(k) { 207 # Generate the ngram reports 208 lapply(1:ngram_params$num_ngrams_collected, function(x) { 209 EncodeAll(sapply(inds[[k]], function(j) ngram[[j]]$ngrams[x]), 210 cohorts[inds[[k]]], map$map_by_cohort, params)}) 211 }) 212 cat("Encoded the ngrams.\n") 213 # Now generate the full string reports 214 full_map <- CreateMap(sort(unique(strs)), params, FALSE) 215 full_reports <- EncodeAll(strs, cohorts, full_map$map_by_cohort, params) 216 217 list(reports = reports, cohorts = cohorts, ngram = ngram, map = map, 218 strs = strs, pairings = pairings, inds = inds, cands = cands, 219 full_reports = full_reports, full_map = full_map) 220 221} 222 223 224EstimateDictionaryTrial <- function(N, str_len, num_strs, 225 params, ngram_params, 226 distribution = 3) { 227 # Runs a single trial for simulation. Generates simulated reports, 228 # decodes them, and returns the result. 229 # 230 # Arguments: 231 # N: Number of users to simulation 232 # str_len: The length of strings to estimate 233 # num_strs: The number of strings in the dictionary 234 # params: RAPPOR parameter list 235 # ngram_params: Parameters related to the size of ngrams 236 # distribution: Tells what kind of distribution to use: 237 # 1: Zipfian 238 # 2: Geometric 239 # 3: Uniform (default) 240 # 241 # Returns: 242 # List with recovered and true marginals. 243 244 # We call the needed libraries here in order to make them available when this 245 # function gets called by BorgApply. Otherwise, they do not get included. 246 library(glmnet) 247 library(parallel) 248 sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs, 249 alphabet, params, distribution) 250 251 res <- EstimateDictionary(sim, N, ngram_params, params) 252 str_candidates <- res$found_candidates 253 pairwise_candidates <- res$pairwise_candidates 254 255 if (length(str_candidates) == 0) { 256 return (NULL) 257 } 258 updated_map <- UpdateMapWithCandidates(str_candidates, sim, params) 259 260 # Compute the marginal for this new set of strings 261 variable_counts <- ComputeCounts(sim$full_reports, sim$cohorts, params) 262 # Our dictionary estimate 263 marginal <- Decode(variable_counts, updated_map$all_cohorts_map, params)$fit 264 # Estimate given full dictionary knowledge 265 marginal_full <- Decode(variable_counts, sim$full_map$all_cohorts_map, params)$fit 266 # The true (sampled) data distribution 267 truth <- sort(table(sim$strs)) / N 268 269 list(marginal = marginal, marginal_full = marginal_full, 270 truth = truth, pairwise_candidates = pairwise_candidates) 271} 272