1*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved. 2*2abb3134SXin Li# 3*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License"); 4*2abb3134SXin Li# you may not use this file except in compliance with the License. 5*2abb3134SXin Li# You may obtain a copy of the License at 6*2abb3134SXin Li# 7*2abb3134SXin Li# http://www.apache.org/licenses/LICENSE-2.0 8*2abb3134SXin Li# 9*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software 10*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS, 11*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*2abb3134SXin Li# See the License for the specific language governing permissions and 13*2abb3134SXin Li# limitations under the License. 14*2abb3134SXin Li 15*2abb3134SXin Li# Author: [email protected] (Giulia Fanti) 16*2abb3134SXin Li# 17*2abb3134SXin Li# Tests the unknown unknowns dictionary estimation functions. 18*2abb3134SXin Li# There are two main components involved in estimating this unknown 19*2abb3134SXin Li# distribution: 20*2abb3134SXin Li# a) Find the pairwise ngrams that co-occur often. 21*2abb3134SXin Li# b) Determine which full strings are consisted with all pairwise 22*2abb3134SXin Li# relations. 23*2abb3134SXin Li# 24*2abb3134SXin Li# TestEstimateDictionary() tests the full pipeline, including parts (a) 25*2abb3134SXin Li# and (b). 26*2abb3134SXin Li# TestFindFeasibleStrings() tests only part (b). 27*2abb3134SXin Li# Both tests generate their own data. 28*2abb3134SXin Li 29*2abb3134SXin Lilibrary(parallel) 30*2abb3134SXin Lisource("analysis/R/encode.R") 31*2abb3134SXin Lisource("analysis/R/decode.R") 32*2abb3134SXin Lisource("analysis/R/simulation.R") 33*2abb3134SXin Lisource("analysis/R/association.R") 34*2abb3134SXin Lisource("analysis/R/decode_ngrams.R") 35*2abb3134SXin Lisource("analysis/R/ngrams_simulation.R") 36*2abb3134SXin Lialphabet <- letters 37*2abb3134SXin Lioptions(warn = -1) 38*2abb3134SXin Li 39*2abb3134SXin LiGeneratePopulation <- function(N, num_strs, str_len = 10, 40*2abb3134SXin Li distribution = NULL) { 41*2abb3134SXin Li # Generates a /deterministic/ string for each individual in the 42*2abb3134SXin Li # population from distribution. 43*2abb3134SXin Li # 44*2abb3134SXin Li # Args: 45*2abb3134SXin Li # N: Number of individuals in the population 46*2abb3134SXin Li # num_strs: Number of strings from which to draw strings 47*2abb3134SXin Li # str_len: Length of each string 48*2abb3134SXin Li # distribution: Just here for compatibility with original 49*2abb3134SXin Li # GeneratePopulation function in ngrams_simulation.R 50*2abb3134SXin Li # 51*2abb3134SXin Li # Returns: 52*2abb3134SXin Li # Vector of strings for each individual in the population 53*2abb3134SXin Li 54*2abb3134SXin Li strs <- sapply(1:num_strs, function(i) { 55*2abb3134SXin Li paste0(alphabet[(str_len * (i - 1) + 1):(str_len * i)], collapse = "") 56*2abb3134SXin Li }) 57*2abb3134SXin Li 58*2abb3134SXin Li # Uniform distribution 59*2abb3134SXin Li prob <- rep(1 / num_strs, num_strs) 60*2abb3134SXin Li sample(strs, N, replace = TRUE, prob = prob) 61*2abb3134SXin Li} 62*2abb3134SXin Li 63*2abb3134SXin LiTestEstimateDictionary <- function() { 64*2abb3134SXin Li # Tests that the algorithm without noise recovers a uniform 65*2abb3134SXin Li # string population correctly. 66*2abb3134SXin Li 67*2abb3134SXin Li # Compute the strings from measuring only 2 ngrams 68*2abb3134SXin Li N <- 100 69*2abb3134SXin Li str_len <- 6 70*2abb3134SXin Li ngram_size <- 2 71*2abb3134SXin Li num_ngrams <- str_len / ngram_size 72*2abb3134SXin Li num_strs <- 1 73*2abb3134SXin Li 74*2abb3134SXin Li params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0) 75*2abb3134SXin Li 76*2abb3134SXin Li ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams, 77*2abb3134SXin Li num_ngrams_collected = 2) 78*2abb3134SXin Li 79*2abb3134SXin Li sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs, 80*2abb3134SXin Li alphabet, params, distribution = 3) 81*2abb3134SXin Li 82*2abb3134SXin Li res <- EstimateDictionary(sim, N, ngram_params, params) 83*2abb3134SXin Li 84*2abb3134SXin Li # Check that the correct strings are found 85*2abb3134SXin Li if (num_strs == 1) { 86*2abb3134SXin Li checkTrue(res$found_candidates == sort(unique(sim$strs))) 87*2abb3134SXin Li } else { 88*2abb3134SXin Li checkTrue(all.equal(res$found_candidates, sort(unique(sim$strs)))) 89*2abb3134SXin Li } 90*2abb3134SXin Li} 91*2abb3134SXin Li 92*2abb3134SXin LiTestFindFeasibleStrings <- function() { 93*2abb3134SXin Li # Tests that FindPairwiseCandidates weeds out false positives. 94*2abb3134SXin Li # We test this by adding false positives to the pairwise estimates. 95*2abb3134SXin Li N <- 100 96*2abb3134SXin Li str_len <- 6 97*2abb3134SXin Li ngram_size <- 2 98*2abb3134SXin Li num_ngrams <- str_len / ngram_size 99*2abb3134SXin Li num_strs <- 2 100*2abb3134SXin Li 101*2abb3134SXin Li params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0) 102*2abb3134SXin Li 103*2abb3134SXin Li ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams, 104*2abb3134SXin Li num_ngrams_collected = 2) 105*2abb3134SXin Li 106*2abb3134SXin Li sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs, 107*2abb3134SXin Li alphabet, params) 108*2abb3134SXin Li 109*2abb3134SXin Li pairwise_candidates <- FindPairwiseCandidates(sim, N, ngram_params, 110*2abb3134SXin Li params)$candidate_strs 111*2abb3134SXin Li cat("Found the pairwise candidates. \n") 112*2abb3134SXin Li 113*2abb3134SXin Li pairwise_candidates[[1]] <- rbind(pairwise_candidates[[1]], c("ab", "le")) 114*2abb3134SXin Li 115*2abb3134SXin Li if (is.null(pairwise_candidates)) { 116*2abb3134SXin Li return (FALSE) 117*2abb3134SXin Li } 118*2abb3134SXin Li 119*2abb3134SXin Li conn <- file('graph.txt', 'w+') 120*2abb3134SXin Li WriteKPartiteGraph(conn, 121*2abb3134SXin Li pairwise_candidates, 122*2abb3134SXin Li sim$pairings, 123*2abb3134SXin Li ngram_params$num_ngrams, 124*2abb3134SXin Li ngram_params$ngram_size) 125*2abb3134SXin Li 126*2abb3134SXin Li close(conn) 127*2abb3134SXin Li cat("Wrote graph.txt\n") 128*2abb3134SXin Li 129*2abb3134SXin Li found_candidates <- FindFeasibleStrings(pairwise_candidates, 130*2abb3134SXin Li sim$pairings, 131*2abb3134SXin Li ngram_params$num_ngrams, 132*2abb3134SXin Li ngram_params$ngram_size) 133*2abb3134SXin Li # Check that the correct strings are found 134*2abb3134SXin Li if (num_strs == 1) { 135*2abb3134SXin Li checkTrue(found_candidates == sort(unique(sim$strs))) 136*2abb3134SXin Li } else { 137*2abb3134SXin Li checkTrue(all.equal(found_candidates, sort(unique(sim$strs)))) 138*2abb3134SXin Li } 139*2abb3134SXin Li} 140