
"""
csv.py - read/write/investigate CSV files
"""

import re
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 field_size_limit, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
from _csv import Dialect as _Dialect

from io import StringIO

__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
           "Error", "Dialect", "__doc__", "excel", "excel_tab",
           "field_size_limit", "reader", "writer",
           "register_dialect", "get_dialect", "list_dialects", "Sniffer",
           "unregister_dialect", "__version__", "DictReader", "DictWriter",
           "unix_dialect"]


class Dialect:
    """Describe a CSV dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses are flagged as valid; the bare base class only
        # carries placeholder attributes.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Check the attributes with _csv.Dialect; raise Error if rejected."""
        try:
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))


class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)


class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)


class unix_dialect(Dialect):
    """Describe the usual properties of Unix-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\n'
    quoting = QUOTE_ALL
register_dialect("unix", unix_dialect)


class DictReader:
    """Iterate over CSV rows, yielding each one as a dict keyed by field name.

    If *fieldnames* is None, the first row read from *f* supplies the keys.
    Values beyond the known field names are collected in a list stored
    under *restkey*; field names with no value in a short row are mapped
    to *restval*.  Remaining arguments are passed to the underlying reader.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        # A one-shot iterator cannot be zipped, measured and sliced once
        # per row, so turn it into a list up front.
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            fieldnames = list(fieldnames)
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        """Field names, read lazily from the first row when not supplied."""
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                # Empty input: there is no header row to take names from.
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def __next__(self):
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
            # Keep line_num in step with the lines consumed while skipping.
            self.line_num = self.reader.line_num
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d


class DictWriter:
    """Write dicts as CSV rows, with columns ordered by *fieldnames*.

    Keys missing from a row dict are written as *restval*.  *extrasaction*
    is matched case-insensitively: "raise" makes a row dict with keys not
    in *fieldnames* raise ValueError, "ignore" drops such keys silently.
    Remaining arguments are passed to the underlying writer.
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        # fieldnames is traversed for the header and again for every row,
        # so a one-shot iterator has to be materialised first.
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            fieldnames = list(fieldnames)
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        normalized = extrasaction.lower()
        if normalized not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        # Store the normalised form: _dict_to_list compares it with "raise"
        # exactly, so keeping e.g. "RAISE" would silently disable the check.
        self.extrasaction = normalized
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a row containing the field names themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        return self.writerow(header)

    def _dict_to_list(self, rowdict):
        """Return the values of *rowdict* in fieldnames order (lazily)."""
        if self.extrasaction == "raise":
            wrong_fields = rowdict.keys() - self.fieldnames
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return (rowdict.get(key, self.restval) for key in self.fieldnames)

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        return self.writer.writerows(map(self._dict_to_list, rowdicts))


# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float


class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']

    def sniff(self, sample, delimiters=None):
        """
        Return a Dialect subclass describing the format of *sample*.

        If *delimiters* is given, only characters it contains are accepted
        as the delimiter.  Raises Error when no delimiter can be determined.
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect

    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns (quotechar, doublequote, delimiter, skipinitialspace).
        """

        # Patterns are tried from most to least specific; the first one
        # that matches anything supplies all the evidence.
        matches = []
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        groupindex = regexp.groupindex
        for m in matches:
            # findall() yields tuples of groups, or plain strings for the
            # single-group pattern; indexing works for both because the
            # quote group is exactly one character long.
            n = groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # This pattern has no delimiter group.
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = max(quotes, key=quotes.get)

        if delims:
            delim = max(delims, key=delims.get)
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)

        doublequote = bool(dq_regexp.search(data))

        return (quotechar, doublequote, delim, skipinitialspace)

    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.

        Returns (delimiter, skipinitialspace); the delimiter is '' when
        no candidate was found.
        """

        data = list(filter(None, data.split('\n')))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, chunkLength
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    # character never occurs on any line seen so far
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = max(items, key=lambda x: x[1])
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - sum(item[1] for item in items))
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(min(chunkLength * iteration, len(data)))
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)

    def has_header(self, sample):
        """Guess whether the first row of *sample* is a header row."""
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = dict.fromkeys(range(columns))

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # iterate over a copy: columns may be deleted inside the loop
            for col in list(columnTypes.keys()):
                thisType = complex
                try:
                    thisType(row[col])
                except (ValueError, OverflowError):
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if isinstance(colType, int): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    # TypeError covers a column whose type is still None
                    # because no data row was examined.
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0