xref: /aosp_15_r20/external/rappor/analysis/R/read_input.R (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved.
2*2abb3134SXin Li#
3*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License");
4*2abb3134SXin Li# you may not use this file except in compliance with the License.
5*2abb3134SXin Li# You may obtain a copy of the License at
6*2abb3134SXin Li#
7*2abb3134SXin Li#     http://www.apache.org/licenses/LICENSE-2.0
8*2abb3134SXin Li#
9*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software
10*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS,
11*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*2abb3134SXin Li# See the License for the specific language governing permissions and
13*2abb3134SXin Li# limitations under the License.
14*2abb3134SXin Li
15*2abb3134SXin Li#
16*2abb3134SXin Li# Read parameter, counts and map files.
17*2abb3134SXin Li
18*2abb3134SXin Lilibrary(Matrix)
19*2abb3134SXin Li
20*2abb3134SXin Lisource.rappor <- function(rel_path)  {
21*2abb3134SXin Li  abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
22*2abb3134SXin Li  source(abs_path)
23*2abb3134SXin Li}
24*2abb3134SXin Li
25*2abb3134SXin Lisource.rappor("analysis/R/util.R")  # for Log
26*2abb3134SXin Li
27*2abb3134SXin Li
28*2abb3134SXin LiReadParameterFile <- function(params_file) {
29*2abb3134SXin Li  # Read parameter file. Format:
30*2abb3134SXin Li  # k, h, m, p, q, f
31*2abb3134SXin Li  # 128, 2, 8, 0.5, 0.75, 0.75
32*2abb3134SXin Li
33*2abb3134SXin Li  params <- as.list(read.csv(params_file))
34*2abb3134SXin Li  if (length(params) != 6) {
35*2abb3134SXin Li    stop("There should be exactly 6 columns in the parameter file.")
36*2abb3134SXin Li  }
37*2abb3134SXin Li  if (any(names(params) != c("k", "h", "m", "p", "q", "f"))) {
38*2abb3134SXin Li    stop("Parameter names must be k,h,m,p,q,f.")
39*2abb3134SXin Li  }
40*2abb3134SXin Li  params
41*2abb3134SXin Li}
42*2abb3134SXin Li
43*2abb3134SXin Li# Handle the case of redundant cohorts, i.e. the counts file needs to be
44*2abb3134SXin Li# further aggregated to obtain counts for the number of cohorts specified in
45*2abb3134SXin Li# the params file.
46*2abb3134SXin Li#
47*2abb3134SXin Li# NOTE: Why is this happening?
48*2abb3134SXin LiAdjustCounts <- function(counts, params) {
49*2abb3134SXin Li  apply(counts, 2, function(x) {
50*2abb3134SXin Li    tapply(x, rep(1:params$m, nrow(counts) / params$m), sum)
51*2abb3134SXin Li  })
52*2abb3134SXin Li}
53*2abb3134SXin Li
54*2abb3134SXin LiReadCountsFile <- function(counts_file, params, adjust_counts = FALSE) {
55*2abb3134SXin Li  # Read in the counts file.
56*2abb3134SXin Li  if (!file.exists(counts_file)) {
57*2abb3134SXin Li    return(NULL)
58*2abb3134SXin Li  }
59*2abb3134SXin Li  counts <- read.csv(counts_file, header = FALSE)
60*2abb3134SXin Li
61*2abb3134SXin Li  if (adjust_counts) {
62*2abb3134SXin Li    counts <- AdjustCounts(counts, params)
63*2abb3134SXin Li  }
64*2abb3134SXin Li
65*2abb3134SXin Li  if (nrow(counts) != params$m) {
66*2abb3134SXin Li    stop(sprintf("Got %d rows in the counts file, expected m = %d",
67*2abb3134SXin Li                 nrow(counts), params$m))
68*2abb3134SXin Li  }
69*2abb3134SXin Li
70*2abb3134SXin Li  if ((ncol(counts) - 1) != params$k) {
71*2abb3134SXin Li    stop(paste0("Counts file: number of columns should equal to k + 1: ",
72*2abb3134SXin Li                ncol(counts)))
73*2abb3134SXin Li  }
74*2abb3134SXin Li
75*2abb3134SXin Li  if (any(counts < 0)) {
76*2abb3134SXin Li    stop("Counts file: all counts must be positive.")
77*2abb3134SXin Li  }
78*2abb3134SXin Li
79*2abb3134SXin Li  # Turn counts from a data frame into a matrix.  (In R a data frame and matrix
80*2abb3134SXin Li  # are sometimes interchangeable, but sometimes we need it to be matrix.)
81*2abb3134SXin Li  as.matrix(counts)
82*2abb3134SXin Li}
83*2abb3134SXin Li
84*2abb3134SXin LiReadMapFile <- function(map_file, params) {
85*2abb3134SXin Li  # Read in the map file which is in the following format (two hash functions):
86*2abb3134SXin Li  # str1, h11, h12, h21 + k, h22 + k, h31 + 2k, h32 + 2k ...
87*2abb3134SXin Li  # str2, ...
88*2abb3134SXin Li  # Output:
89*2abb3134SXin Li  #    map: a sparse representation of set bits for each candidate string.
90*2abb3134SXin Li  #    strs: a vector of all candidate strings.
91*2abb3134SXin Li
92*2abb3134SXin Li  Log("Parsing %s", map_file)
93*2abb3134SXin Li
94*2abb3134SXin Li  map_pos <- read.csv(map_file, header = FALSE, as.is = TRUE)
95*2abb3134SXin Li  strs <- map_pos[, 1]
96*2abb3134SXin Li  strs[strs == ""] <- "Empty"
97*2abb3134SXin Li
98*2abb3134SXin Li  # Remove duplicated strings.
99*2abb3134SXin Li  ind <- which(!duplicated(strs))
100*2abb3134SXin Li  strs <- strs[ind]
101*2abb3134SXin Li  map_pos <- map_pos[ind, ]
102*2abb3134SXin Li
103*2abb3134SXin Li  n <- ncol(map_pos) - 1
104*2abb3134SXin Li  if (n != (params$h * params$m)) {
105*2abb3134SXin Li    stop(paste0("Map file: number of columns should equal hm + 1:",
106*2abb3134SXin Li                n, "_", params$h * params$m))
107*2abb3134SXin Li  }
108*2abb3134SXin Li
109*2abb3134SXin Li  row_pos <- unlist(map_pos[, -1], use.names = FALSE)
110*2abb3134SXin Li  col_pos <- rep(1:nrow(map_pos), times = ncol(map_pos) - 1)
111*2abb3134SXin Li
112*2abb3134SXin Li  # TODO: When would this ever happen?
113*2abb3134SXin Li  removed <- which(is.na(row_pos))
114*2abb3134SXin Li  if (length(removed) > 0) {
115*2abb3134SXin Li    Log("Removed %d entries", length(removed))
116*2abb3134SXin Li    row_pos <- row_pos[-removed]
117*2abb3134SXin Li    col_pos <- col_pos[-removed]
118*2abb3134SXin Li  }
119*2abb3134SXin Li
120*2abb3134SXin Li  map <- sparseMatrix(row_pos, col_pos,
121*2abb3134SXin Li                      dims = c(params$m * params$k, length(strs)))
122*2abb3134SXin Li
123*2abb3134SXin Li  colnames(map) <- strs
124*2abb3134SXin Li  list(map = map, strs = strs, map_pos = map_pos)
125*2abb3134SXin Li}
126*2abb3134SXin Li
127*2abb3134SXin LiLoadMapFile <- function(map_file, params) {
128*2abb3134SXin Li  # Reads the map file, caching an .rda (R binary data) version of it to speed
129*2abb3134SXin Li  # up future loads.
130*2abb3134SXin Li
131*2abb3134SXin Li  rda_path <- sub(".csv", ".rda", map_file, fixed = TRUE)
132*2abb3134SXin Li  # This must be unique per process, so concurrent processes don't try to
133*2abb3134SXin Li  # write the same file.
134*2abb3134SXin Li  tmp_path <- sprintf("%s.%d", rda_path, Sys.getpid())
135*2abb3134SXin Li
136*2abb3134SXin Li  # First save to a temp file, and then atomically rename to the destination.
137*2abb3134SXin Li  if (file.exists(rda_path)) {
138*2abb3134SXin Li    Log("Loading %s", rda_path)
139*2abb3134SXin Li    load(rda_path, .GlobalEnv)  # creates the 'map' variable in the global env
140*2abb3134SXin Li  } else {
141*2abb3134SXin Li    map <- ReadMapFile(map_file, params)
142*2abb3134SXin Li
143*2abb3134SXin Li    Log("Saving %s as an rda file for faster access", map_file)
144*2abb3134SXin Li    tryCatch({
145*2abb3134SXin Li      save(map, file = tmp_path)
146*2abb3134SXin Li      file.rename(tmp_path, rda_path)
147*2abb3134SXin Li    }, warning = function(w) {
148*2abb3134SXin Li      Log("WARNING: %s", w)
149*2abb3134SXin Li    }, error = function(e) {
150*2abb3134SXin Li      Log("ERROR: %s", e)
151*2abb3134SXin Li    })
152*2abb3134SXin Li  }
153*2abb3134SXin Li  return(map)
154*2abb3134SXin Li}
155