1*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved. 2*2abb3134SXin Li# 3*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License"); 4*2abb3134SXin Li# you may not use this file except in compliance with the License. 5*2abb3134SXin Li# You may obtain a copy of the License at 6*2abb3134SXin Li# 7*2abb3134SXin Li# http://www.apache.org/licenses/LICENSE-2.0 8*2abb3134SXin Li# 9*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software 10*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS, 11*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*2abb3134SXin Li# See the License for the specific language governing permissions and 13*2abb3134SXin Li# limitations under the License. 14*2abb3134SXin Li 15*2abb3134SXin Li# 16*2abb3134SXin Li# Read parameter, counts and map files. 17*2abb3134SXin Li 18*2abb3134SXin Lilibrary(Matrix) 19*2abb3134SXin Li 20*2abb3134SXin Lisource.rappor <- function(rel_path) { 21*2abb3134SXin Li abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path) 22*2abb3134SXin Li source(abs_path) 23*2abb3134SXin Li} 24*2abb3134SXin Li 25*2abb3134SXin Lisource.rappor("analysis/R/util.R") # for Log 26*2abb3134SXin Li 27*2abb3134SXin Li 28*2abb3134SXin LiReadParameterFile <- function(params_file) { 29*2abb3134SXin Li # Read parameter file. Format: 30*2abb3134SXin Li # k, h, m, p, q, f 31*2abb3134SXin Li # 128, 2, 8, 0.5, 0.75, 0.75 32*2abb3134SXin Li 33*2abb3134SXin Li params <- as.list(read.csv(params_file)) 34*2abb3134SXin Li if (length(params) != 6) { 35*2abb3134SXin Li stop("There should be exactly 6 columns in the parameter file.") 36*2abb3134SXin Li } 37*2abb3134SXin Li if (any(names(params) != c("k", "h", "m", "p", "q", "f"))) { 38*2abb3134SXin Li stop("Parameter names must be k,h,m,p,q,f.") 39*2abb3134SXin Li } 40*2abb3134SXin Li params 41*2abb3134SXin Li} 42*2abb3134SXin Li 43*2abb3134SXin Li# Handle the case of redundant cohorts, i.e. the counts file needs to be 44*2abb3134SXin Li# further aggregated to obtain counts for the number of cohorts specified in 45*2abb3134SXin Li# the params file. 46*2abb3134SXin Li# 47*2abb3134SXin Li# NOTE: Why is this happening? 48*2abb3134SXin LiAdjustCounts <- function(counts, params) { 49*2abb3134SXin Li apply(counts, 2, function(x) { 50*2abb3134SXin Li tapply(x, rep(1:params$m, nrow(counts) / params$m), sum) 51*2abb3134SXin Li }) 52*2abb3134SXin Li} 53*2abb3134SXin Li 54*2abb3134SXin LiReadCountsFile <- function(counts_file, params, adjust_counts = FALSE) { 55*2abb3134SXin Li # Read in the counts file. 56*2abb3134SXin Li if (!file.exists(counts_file)) { 57*2abb3134SXin Li return(NULL) 58*2abb3134SXin Li } 59*2abb3134SXin Li counts <- read.csv(counts_file, header = FALSE) 60*2abb3134SXin Li 61*2abb3134SXin Li if (adjust_counts) { 62*2abb3134SXin Li counts <- AdjustCounts(counts, params) 63*2abb3134SXin Li } 64*2abb3134SXin Li 65*2abb3134SXin Li if (nrow(counts) != params$m) { 66*2abb3134SXin Li stop(sprintf("Got %d rows in the counts file, expected m = %d", 67*2abb3134SXin Li nrow(counts), params$m)) 68*2abb3134SXin Li } 69*2abb3134SXin Li 70*2abb3134SXin Li if ((ncol(counts) - 1) != params$k) { 71*2abb3134SXin Li stop(paste0("Counts file: number of columns should equal to k + 1: ", 72*2abb3134SXin Li ncol(counts))) 73*2abb3134SXin Li } 74*2abb3134SXin Li 75*2abb3134SXin Li if (any(counts < 0)) { 76*2abb3134SXin Li stop("Counts file: all counts must be positive.") 77*2abb3134SXin Li } 78*2abb3134SXin Li 79*2abb3134SXin Li # Turn counts from a data frame into a matrix. (In R a data frame and matrix 80*2abb3134SXin Li # are sometimes interchangeable, but sometimes we need it to be matrix.) 81*2abb3134SXin Li as.matrix(counts) 82*2abb3134SXin Li} 83*2abb3134SXin Li 84*2abb3134SXin LiReadMapFile <- function(map_file, params) { 85*2abb3134SXin Li # Read in the map file which is in the following format (two hash functions): 86*2abb3134SXin Li # str1, h11, h12, h21 + k, h22 + k, h31 + 2k, h32 + 2k ... 87*2abb3134SXin Li # str2, ... 88*2abb3134SXin Li # Output: 89*2abb3134SXin Li # map: a sparse representation of set bits for each candidate string. 90*2abb3134SXin Li # strs: a vector of all candidate strings. 91*2abb3134SXin Li 92*2abb3134SXin Li Log("Parsing %s", map_file) 93*2abb3134SXin Li 94*2abb3134SXin Li map_pos <- read.csv(map_file, header = FALSE, as.is = TRUE) 95*2abb3134SXin Li strs <- map_pos[, 1] 96*2abb3134SXin Li strs[strs == ""] <- "Empty" 97*2abb3134SXin Li 98*2abb3134SXin Li # Remove duplicated strings. 99*2abb3134SXin Li ind <- which(!duplicated(strs)) 100*2abb3134SXin Li strs <- strs[ind] 101*2abb3134SXin Li map_pos <- map_pos[ind, ] 102*2abb3134SXin Li 103*2abb3134SXin Li n <- ncol(map_pos) - 1 104*2abb3134SXin Li if (n != (params$h * params$m)) { 105*2abb3134SXin Li stop(paste0("Map file: number of columns should equal hm + 1:", 106*2abb3134SXin Li n, "_", params$h * params$m)) 107*2abb3134SXin Li } 108*2abb3134SXin Li 109*2abb3134SXin Li row_pos <- unlist(map_pos[, -1], use.names = FALSE) 110*2abb3134SXin Li col_pos <- rep(1:nrow(map_pos), times = ncol(map_pos) - 1) 111*2abb3134SXin Li 112*2abb3134SXin Li # TODO: When would this ever happen? 113*2abb3134SXin Li removed <- which(is.na(row_pos)) 114*2abb3134SXin Li if (length(removed) > 0) { 115*2abb3134SXin Li Log("Removed %d entries", length(removed)) 116*2abb3134SXin Li row_pos <- row_pos[-removed] 117*2abb3134SXin Li col_pos <- col_pos[-removed] 118*2abb3134SXin Li } 119*2abb3134SXin Li 120*2abb3134SXin Li map <- sparseMatrix(row_pos, col_pos, 121*2abb3134SXin Li dims = c(params$m * params$k, length(strs))) 122*2abb3134SXin Li 123*2abb3134SXin Li colnames(map) <- strs 124*2abb3134SXin Li list(map = map, strs = strs, map_pos = map_pos) 125*2abb3134SXin Li} 126*2abb3134SXin Li 127*2abb3134SXin LiLoadMapFile <- function(map_file, params) { 128*2abb3134SXin Li # Reads the map file, caching an .rda (R binary data) version of it to speed 129*2abb3134SXin Li # up future loads. 130*2abb3134SXin Li 131*2abb3134SXin Li rda_path <- sub(".csv", ".rda", map_file, fixed = TRUE) 132*2abb3134SXin Li # This must be unique per process, so concurrent processes don't try to 133*2abb3134SXin Li # write the same file. 134*2abb3134SXin Li tmp_path <- sprintf("%s.%d", rda_path, Sys.getpid()) 135*2abb3134SXin Li 136*2abb3134SXin Li # First save to a temp file, and then atomically rename to the destination. 137*2abb3134SXin Li if (file.exists(rda_path)) { 138*2abb3134SXin Li Log("Loading %s", rda_path) 139*2abb3134SXin Li load(rda_path, .GlobalEnv) # creates the 'map' variable in the global env 140*2abb3134SXin Li } else { 141*2abb3134SXin Li map <- ReadMapFile(map_file, params) 142*2abb3134SXin Li 143*2abb3134SXin Li Log("Saving %s as an rda file for faster access", map_file) 144*2abb3134SXin Li tryCatch({ 145*2abb3134SXin Li save(map, file = tmp_path) 146*2abb3134SXin Li file.rename(tmp_path, rda_path) 147*2abb3134SXin Li }, warning = function(w) { 148*2abb3134SXin Li Log("WARNING: %s", w) 149*2abb3134SXin Li }, error = function(e) { 150*2abb3134SXin Li Log("ERROR: %s", e) 151*2abb3134SXin Li }) 152*2abb3134SXin Li } 153*2abb3134SXin Li return(map) 154*2abb3134SXin Li} 155