1# Copyright 2014 Google Inc. All rights reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14 15# Author: [email protected] (Giulia Fanti) 16# 17# Tests the unknown unknowns dictionary estimation functions. 18# There are two main components involved in estimating this unknown 19# distribution: 20# a) Find the pairwise ngrams that co-occur often. 21# b) Determine which full strings are consisted with all pairwise 22# relations. 23# 24# TestEstimateDictionary() tests the full pipeline, including parts (a) 25# and (b). 26# TestFindFeasibleStrings() tests only part (b). 27# Both tests generate their own data. 28 29library(parallel) 30source("analysis/R/encode.R") 31source("analysis/R/decode.R") 32source("analysis/R/simulation.R") 33source("analysis/R/association.R") 34source("analysis/R/decode_ngrams.R") 35source("analysis/R/ngrams_simulation.R") 36alphabet <- letters 37options(warn = -1) 38 39GeneratePopulation <- function(N, num_strs, str_len = 10, 40 distribution = NULL) { 41 # Generates a /deterministic/ string for each individual in the 42 # population from distribution. 43 # 44 # Args: 45 # N: Number of individuals in the population 46 # num_strs: Number of strings from which to draw strings 47 # str_len: Length of each string 48 # distribution: Just here for compatibility with original 49 # GeneratePopulation function in ngrams_simulation.R 50 # 51 # Returns: 52 # Vector of strings for each individual in the population 53 54 strs <- sapply(1:num_strs, function(i) { 55 paste0(alphabet[(str_len * (i - 1) + 1):(str_len * i)], collapse = "") 56 }) 57 58 # Uniform distribution 59 prob <- rep(1 / num_strs, num_strs) 60 sample(strs, N, replace = TRUE, prob = prob) 61} 62 63TestEstimateDictionary <- function() { 64 # Tests that the algorithm without noise recovers a uniform 65 # string population correctly. 66 67 # Compute the strings from measuring only 2 ngrams 68 N <- 100 69 str_len <- 6 70 ngram_size <- 2 71 num_ngrams <- str_len / ngram_size 72 num_strs <- 1 73 74 params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0) 75 76 ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams, 77 num_ngrams_collected = 2) 78 79 sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs, 80 alphabet, params, distribution = 3) 81 82 res <- EstimateDictionary(sim, N, ngram_params, params) 83 84 # Check that the correct strings are found 85 if (num_strs == 1) { 86 checkTrue(res$found_candidates == sort(unique(sim$strs))) 87 } else { 88 checkTrue(all.equal(res$found_candidates, sort(unique(sim$strs)))) 89 } 90} 91 92TestFindFeasibleStrings <- function() { 93 # Tests that FindPairwiseCandidates weeds out false positives. 94 # We test this by adding false positives to the pairwise estimates. 95 N <- 100 96 str_len <- 6 97 ngram_size <- 2 98 num_ngrams <- str_len / ngram_size 99 num_strs <- 2 100 101 params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0) 102 103 ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams, 104 num_ngrams_collected = 2) 105 106 sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs, 107 alphabet, params) 108 109 pairwise_candidates <- FindPairwiseCandidates(sim, N, ngram_params, 110 params)$candidate_strs 111 cat("Found the pairwise candidates. \n") 112 113 pairwise_candidates[[1]] <- rbind(pairwise_candidates[[1]], c("ab", "le")) 114 115 if (is.null(pairwise_candidates)) { 116 return (FALSE) 117 } 118 119 conn <- file('graph.txt', 'w+') 120 WriteKPartiteGraph(conn, 121 pairwise_candidates, 122 sim$pairings, 123 ngram_params$num_ngrams, 124 ngram_params$ngram_size) 125 126 close(conn) 127 cat("Wrote graph.txt\n") 128 129 found_candidates <- FindFeasibleStrings(pairwise_candidates, 130 sim$pairings, 131 ngram_params$num_ngrams, 132 ngram_params$ngram_size) 133 # Check that the correct strings are found 134 if (num_strs == 1) { 135 checkTrue(found_candidates == sort(unique(sim$strs))) 136 } else { 137 checkTrue(all.equal(found_candidates, sort(unique(sim$strs)))) 138 } 139} 140