# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
# This file has functions that aid in the estimation of a distribution when the
# dictionary is unknown. There are functions for estimating pairwise joint
# ngram distributions, pruning out false positives, and combining the two
# steps.

FindPairwiseCandidates <- function(report_data, N, ngram_params, params) {
  # Finds the most likely ngrams for each pair of ngram positions.
  #
  # Args:
  #   report_data: Object containing data relevant to reports:
  #     $inds: The indices of reports collected using various pairs
  #     $cohorts: The cohort of each report
  #     $map: The map used for all the ngrams
  #     $reports: The reports used for each ngram and full string
  #   N: Number of reports collected
  #   ngram_params: Parameters related to ngram size
  #   params: Parameter list.
  #
  # Returns:
  #   List with $candidate_strs (one matrix of significant ngram pairs per
  #   ngram position pair) and $dists (the pairwise distributions), or NULL
  #   if any position pair yields no candidates.
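  # Illustrative input shape (hypothetical sizes, not from any particular
  # dataset): with num_ngrams = 3 ngram positions there are choose(3, 2) = 3
  # position pairs, so report_data$inds and report_data$reports would each
  # hold 3 entries (one per position pair), report_data$cohorts would hold
  # one cohort label per report, and report_data$map is shared by all ngrams.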

  inds <- report_data$inds
  cohorts <- report_data$cohorts
  num_ngrams_collected <- ngram_params$num_ngrams_collected
  map <- report_data$map
  reports <- report_data$reports

  # Cycle over all the unique pairs of ngrams being collected
  found_candidates <- list()

  # Generate the map list to be used for all ngrams
  maps <- lapply(1:num_ngrams_collected, function(x) map)
  num_candidate_ngrams <- length(inds)

  .ComputeDist <- function(i, inds, cohorts, reports, maps, params,
                           num_ngrams_collected) {
    # Load glmnet inside the helper so it is self-contained (e.g. if this is
    # ever run in parallel workers).
    library(glmnet)
    ind <- inds[[i]]
    cohort_subset <- lapply(1:num_ngrams_collected, function(x)
                            cohorts[ind])
    report_subset <- reports[[i]]
    new_dist <- ComputeDistributionEM(report_subset,
                                      cohort_subset,
                                      maps, ignore_other = FALSE,
                                      params = params, estimate_var = FALSE)
    new_dist
  }

  # Compute the pairwise distributions (could be parallelized)
  dists <- lapply(seq(num_candidate_ngrams), function(i)
                  .ComputeDist(i, inds, cohorts, reports, maps,
                               params, num_ngrams_collected))

  dists_null <- sapply(dists, is.null)
  if (any(dists_null)) {
    return (list(found_candidates = list(), dists = dists))
  }
  cat("Found the pairwise ngram distributions.\n")

  # Find the threshold for choosing "significant" ngram pairs
  f <- params$f; q <- params$q; p <- params$p
  q2 <- .5 * f * (p + q) + (1 - f) * q
  p2 <- .5 * f * (p + q) + (1 - f) * p
  std_dev_counts <- sqrt(p2 * (1 - p2) * N) / (q2 - p2)
  threshold <- std_dev_counts / N
  # NOTE: The derived threshold is currently overridden by a fixed value.
  threshold <- 0.04

  # Filter joints to remove infrequently co-occurring ngrams.
  candidate_strs <- lapply(1:num_candidate_ngrams, function(i) {
    fit <- dists[[i]]$fit
    edges <- which(fit > threshold, arr.ind = TRUE, useNames = FALSE)

    # Recover the list of strings that seem significant
    found_candidates <- sapply(1:ncol(edges), function(x) {
      chunks <- sapply(edges[, x],
                       function(j) dimnames(fit)[[x]][j])
      chunks
    })
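    # Illustrative example of the two steps above (hypothetical 2 x 2 fit
    # matrix with made-up ngram dimnames and the fixed 0.04 threshold):
    # > fit <- matrix(c(0.5, 0.01, 0.02, 0.3), 2, 2,
    # +               dimnames = list(c("ab", "cd"), c("ef", "gh")))
    # > which(fit > 0.04, arr.ind = TRUE, useNames = FALSE)
    #      [,1] [,2]
    # [1,]    1    1
    # [2,]    2    2
    # so the recovered candidate pairs here are ("ab", "ef") and ("cd", "gh").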
    # sapply returns either a "character" vector (for n = 1) or a matrix.
    # Convert it to a matrix. This can be seen as follows:
    #
    # > class(sapply(1:5, function(x) "a"))
    # [1] "character"
    # > class(sapply(1:5, function(x) c("a", "b")))
    # [1] "matrix"
    found_candidates <- rbind(found_candidates)

    # Remove rows containing the "Other" category
    other_rows <- which(found_candidates == "Other", arr.ind = TRUE)[, 1]
    if (length(other_rows) > 0) {
      # drop = FALSE necessary to keep it a matrix
      found_candidates <- found_candidates[-other_rows, , drop = FALSE]
    }

    found_candidates
  })
  # If any position pair produced no candidates, no full string can be
  # reconstructed.
  if (any(sapply(candidate_strs, length) == 0)) {
    return (NULL)
  }

  list(candidate_strs = candidate_strs, dists = dists)
}

FindFeasibleStrings <- function(found_candidates, pairings, num_ngrams,
                                ngram_size) {
  # Uses the list of ngram pairs found by the pairwise comparisons to build
  # a list of full feasible strings. This relies on the iterative,
  # graph-based approach.
  #
  # Args:
  #   found_candidates: list of candidates found by each pairwise decoding
  #   pairings: A (num_ngrams choose 2) x 2 matrix listing all the ngram
  #     position pairings (1-based character positions).
  #   num_ngrams: The total number of ngrams per word.
  #   ngram_size: Number of characters per ngram
  #
  # Returns:
  #   Character vector of full string candidates (empty list if none survive).

  # Which ngram pairs are adjacent, i.e. of the form (i, i + 1)
  adjacent <- sapply(seq(num_ngrams - 1), function(x) {
    c(1 + (x - 1) * ngram_size, x * ngram_size + 1)
  })

  adjacent_pairs <- apply(adjacent, 2, function(x) {
    which(apply(pairings, 1, function(y) identical(y, x)))
  })

  # The first set of candidates are ngrams found in positions 1 and 2
  active_cands <- found_candidates[[adjacent_pairs[1]]]
  if (is.list(active_cands)) {
    return (list())
  } else {
    active_cands <- as.data.frame(active_cands)
  }
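  # Worked example of the adjacency bookkeeping above (hypothetical sizes):
  # for num_ngrams = 3 and ngram_size = 2, adjacent ngrams start at character
  # positions (1, 3) and (3, 5):
  # > sapply(seq(3 - 1), function(x) c(1 + (x - 1) * 2, x * 2 + 1))
  #      [,1] [,2]
  # [1,]    1    3
  # [2,]    3    5
  # and adjacent_pairs holds the row indices of pairings matching these
  # columns.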
  # Now check successive ngrams to find consistent combinations,
  # i.e. after ngrams 1-2, check 2-3, 3-4, 4-5, etc.
  for (i in 2:length(adjacent_pairs)) {
    if (nrow(active_cands) == 0) {
      return (list())
    }
    new_cands <- found_candidates[[adjacent_pairs[i]]]
    new_cands <- as.data.frame(new_cands)
    # Builds the set of possible candidates based only on ascending
    # candidate pairs
    active_cands <- BuildCandidates(active_cands, new_cands)
  }

  if (nrow(active_cands) == 0) {
    return (list())
  }
  # Now refine these candidates using the non-adjacent ngram pairs.
  remaining <- (1:(num_ngrams * (num_ngrams - 1) / 2))[-c(1, adjacent_pairs)]
  # For each non-adjacent pair, make sure that all the candidates are
  # consistent (in this phase, candidates can ONLY be eliminated)
  for (i in remaining) {
    new_cands <- found_candidates[[i]]
    new_cands <- as.data.frame(new_cands)
    # Prune out all candidates that do not agree with new_cands
    active_cands <- PruneCandidates(active_cands, pairings[i, ],
                                    ngram_size,
                                    new_cands = new_cands)
  }
  # Consolidate the string ngrams into a full string representation
  if (length(active_cands) > 0) {
    active_cands <- sort(apply(active_cands, 1,
                               function(x) paste0(x, collapse = "")))
  }
  unname(active_cands)
}

BuildCandidates <- function(active_cands, new_cands) {
  # Takes in a data frame where each row is a valid sequence of ngrams and
  # checks which of the new_cands ngram pairs are consistent with the
  # original active_cands ngram sequences.
  #
  # Args:
  #   active_cands: data frame of ngram sequence candidates (1 candidate
  #     sequence per row)
  #   new_cands: An r x 2 data frame with a new list of candidate ngram
  #     pairs that might fit in with the previous list of candidates
  #
  # Returns:
  #   Updated active_cands, with another column if valid extensions are
  #   found.
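  # Illustrative example (hypothetical ngrams): if active_cands has the rows
  # ("ab", "cd") and ("xy", "zw"), and new_cands has the rows ("cd", "ef") and
  # ("cd", "gh"), then ("xy", "zw") is dropped because "zw" never appears in
  # new_cands, while ("ab", "cd") is extended to both ("ab", "cd", "ef") and
  # ("ab", "cd", "gh").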

  # Get the trailing ngrams from the current candidates
  to_check <- as.vector(tail(t(active_cands), n = 1))
  # Check which of the elements in to_check appear among the new candidate
  # pairs
  present <- sapply(to_check, function(x) any(x == new_cands))
  # Remove the strings that are not represented among the new candidates
  to_check <- to_check[present]
  # Now insert the new candidates where they belong
  active_cands <- active_cands[present, , drop = FALSE]
  active_cands <- cbind(active_cands, col = NA)
  num_cands <- nrow(active_cands)
  # Track the current number of rows so that duplicated candidates appended
  # below land at the right indices.
  new_num_cands <- num_cands
  hit_list <- c()
  for (j in 1:num_cands) {
    inds <- which(new_cands[, 1] == to_check[j])
    if (length(inds) == 0) {
      hit_list <- c(hit_list, j)
      next
    }
    # If there are multiple candidates fitting with an ngram, include
    # each /full/ string as a candidate
    extra <- length(inds) - 1
    if (extra > 0) {
      rep_inds <- c(j, (new_num_cands + 1):(new_num_cands + extra))
      to_paste <- active_cands[j, ]
      # Add the new candidates to the bottom
      for (p in 1:extra) {
        active_cands <- rbind(active_cands, to_paste)
      }
    } else {
      rep_inds <- c(j)
    }
    active_cands[rep_inds, ncol(active_cands)] <-
      as.vector(new_cands[inds, 2])
    new_num_cands <- nrow(active_cands)
  }
  # If there were some false candidates in the original set, remove them
  if (length(hit_list) > 0) {
    active_cands <- active_cands[-hit_list, , drop = FALSE]
  }
  active_cands
}

PruneCandidates <- function(active_cands, pairing, ngram_size, new_cands) {
  # Takes in a data frame where each row is a valid sequence of ngrams and
  # checks which of the new_cands ngram pairs are consistent with the
  # original active_cands ngram sequences. This can ONLY remove candidates
  # present in active_cands.
  #
  # Args:
  #   active_cands: data frame of ngram sequence candidates (1 candidate
  #     sequence per row)
  #   pairing: A length-2 list storing which two ngrams are measured
  #   ngram_size: Number of characters per ngram
  #   new_cands: An r x 2 data frame with a new list of candidate ngram
  #     pairs that might fit in with the previous list of candidates
  #
  # Returns:
  #   Updated active_cands, with a reduced number of rows.
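  # Illustrative example (hypothetical values): with ngram_size = 2 and
  # pairing = c(1, 5), the checked columns are (1 - 1) / 2 + 1 = 1 and
  # (5 - 1) / 2 + 1 = 3, so a candidate row like ("ab", "cd", "ef") survives
  # only if ("ab", "ef") appears as a row of new_cands.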

  # Convert the pairing to ngram (column) indices
  cols <- sapply(pairing, function(x) (x - 1) / ngram_size + 1)

  cands_to_check <- active_cands[, cols, drop = FALSE]
  # Find the candidates that are inconsistent with the new data
  hit_list <- sapply(1:nrow(cands_to_check), function(j) {
    to_kill <- FALSE
    if (nrow(new_cands) == 0) {
      return (TRUE)
    }
    if (!any(apply(new_cands, 1, function(x)
                   all(cands_to_check[j, , drop = FALSE] == x)))) {
      to_kill <- TRUE
    }
    to_kill
  })

  # Determine which rows are false positives
  hit_indices <- which(hit_list)
  # Remove the false positives
  if (length(hit_indices) > 0) {
    active_cands <- active_cands[-hit_indices, ]
  }
  active_cands
}

EstimateDictionary <- function(report_data, N, ngram_params, params) {
  # Takes in a list of report data and returns a list of string
  # estimates of the dictionary.
  #
  # Args:
  #   report_data: Object containing data relevant to reports:
  #     $inds: The indices of reports collected using various pairs
  #     $cohorts: The cohort of each report
  #     $map: The map used for all the ngrams
  #     $reports: The reports used for each ngram and full string
  #   N: the number of individuals sending reports
  #   ngram_params: Parameters related to ngram length, etc.
  #   params: Parameter vector with RAPPOR noise levels, cohorts, etc.
  #
  # Returns:
  #   List with $found_candidates (full string estimates) and
  #   $pairwise_candidates (the matrices of significant ngram pairs).

  pairwise_candidates <- FindPairwiseCandidates(report_data, N,
                                                ngram_params,
                                                params)$candidate_strs
  cat("Found the pairwise candidates.\n")
  if (is.null(pairwise_candidates)) {
    return (list())
  }
  found_candidates <- FindFeasibleStrings(pairwise_candidates,
                                          report_data$pairings,
                                          ngram_params$num_ngrams,
                                          ngram_params$ngram_size)
  cat("Found all the candidates.\n")
  list(found_candidates = found_candidates,
       pairwise_candidates = pairwise_candidates)
}
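
# Example usage (hypothetical values for N and the input objects;
# EstimateDictionary is the entry point that chains the two steps above):
# > dict <- EstimateDictionary(report_data, N = 10000, ngram_params, params)
# > dict$found_candidates     # sorted vector of reconstructed full strings
# > dict$pairwise_candidates  # one matrix of ngram pairs per position pair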

WriteKPartiteGraph <- function(conn, pairwise_candidates, pairings, num_ngrams,
                               ngram_size) {
  # Writes the k-partite graph of pairwise ngram candidates to a connection.
  #
  # Args:
  #   conn: R connection to write to. Should be opened with mode w+.
  #   pairwise_candidates: list of matrices. Each matrix represents a
  #     subgraph; it contains the edges between partitions i and j, so there
  #     are (k choose 2) matrices. Each matrix has dimension E x 2, where E
  #     is the number of edges (one edge per row).
  #   pairings: (k choose 2) x 2 matrix of character positions. Each row
  #     corresponds to a subgraph; it has the 1-based character index of
  #     partitions i and j.
  #   num_ngrams: length of pairwise_candidates, or the number of partitions
  #     in the k-partite graph
  #   ngram_size: Number of characters per ngram

  # File Format:
  #
  # num_partitions 3
  # ngram_size 2
  # edge 0.ab 1.cd
  # edge 0.ab 2.ef
  #
  # The first two lines specify the number of partitions (k) and the ngram
  # size. The remaining lines are edges, where each node is
  # <partition>.<ngram>.
  #
  # Partitions are numbered from 0. The partition of the left node will be
  # less than the partition of the right node.

  # First two lines are metadata
  cat(sprintf('num_partitions %d\n', num_ngrams), file = conn)
  cat(sprintf('ngram_size %d\n', ngram_size), file = conn)

  for (i in 1:length(pairwise_candidates)) {
    # The two character positions for this subgraph.
    # Turn 1-based character positions into 0-based partition numbers,
    # e.g. (3, 5) -> (1, 2)
    pos1 <- pairings[[i, 1]]
    pos2 <- pairings[[i, 2]]
    part1 <- (pos1 - 1) / ngram_size
    part2 <- (pos2 - 1) / ngram_size
    cat(sprintf("Writing partition (%d, %d)\n", part1, part2))

    p <- pairwise_candidates[[i]]
    # Each row is an edge
    for (j in 1:nrow(p)) {
      n1 <- p[[j, 1]]
      n2 <- p[[j, 2]]
      line <- sprintf('edge %d.%s %d.%s\n', part1, n1, part2, n2)
      # NOTE: It would be faster to preallocate 'lines', but we would have to
      # make two passes through pairwise_candidates.
      cat(line, file = conn)
    }
  }
}
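
# Example usage of WriteKPartiteGraph (hypothetical file path and variable
# names, following the connection mode noted in the docstring above):
# > conn <- file("/tmp/kpartite_graph.txt", "w+")
# > WriteKPartiteGraph(conn, dict$pairwise_candidates, report_data$pairings,
# +                    ngram_params$num_ngrams, ngram_params$ngram_size)
# > close(conn)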