xref: /aosp_15_r20/prebuilts/build-tools/common/py3-stdlib/csv.py (revision cda5da8d549138a6648c5ee6d7a49cf8f4a657be)
1*cda5da8dSAndroid Build Coastguard Worker
2*cda5da8dSAndroid Build Coastguard Worker"""
3*cda5da8dSAndroid Build Coastguard Workercsv.py - read/write/investigate CSV files
4*cda5da8dSAndroid Build Coastguard Worker"""
5*cda5da8dSAndroid Build Coastguard Worker
6*cda5da8dSAndroid Build Coastguard Workerimport re
7*cda5da8dSAndroid Build Coastguard Workerfrom _csv import Error, __version__, writer, reader, register_dialect, \
8*cda5da8dSAndroid Build Coastguard Worker                 unregister_dialect, get_dialect, list_dialects, \
9*cda5da8dSAndroid Build Coastguard Worker                 field_size_limit, \
10*cda5da8dSAndroid Build Coastguard Worker                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11*cda5da8dSAndroid Build Coastguard Worker                 __doc__
12*cda5da8dSAndroid Build Coastguard Workerfrom _csv import Dialect as _Dialect
13*cda5da8dSAndroid Build Coastguard Worker
14*cda5da8dSAndroid Build Coastguard Workerfrom io import StringIO
15*cda5da8dSAndroid Build Coastguard Worker
16*cda5da8dSAndroid Build Coastguard Worker__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
17*cda5da8dSAndroid Build Coastguard Worker           "Error", "Dialect", "__doc__", "excel", "excel_tab",
18*cda5da8dSAndroid Build Coastguard Worker           "field_size_limit", "reader", "writer",
19*cda5da8dSAndroid Build Coastguard Worker           "register_dialect", "get_dialect", "list_dialects", "Sniffer",
20*cda5da8dSAndroid Build Coastguard Worker           "unregister_dialect", "__version__", "DictReader", "DictWriter",
21*cda5da8dSAndroid Build Coastguard Worker           "unix_dialect"]
22*cda5da8dSAndroid Build Coastguard Worker
class Dialect:
    """Describe a CSV dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    Instantiating the class (or a subclass) validates the attribute
    values by handing the instance to ``_csv.Dialect``; a ``TypeError``
    raised by that check is reported as ``Error``.

    """
    _name = ""
    # Flipped to True by __init__ for instances of any subclass.
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        """Mark subclass instances as valid, then validate the attributes.

        Raises:
            Error: if ``_csv.Dialect`` rejects the attribute values.
        """
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Check the dialect attributes by building a ``_csv.Dialect``.

        Raises:
            Error: carrying the message of the ``TypeError`` raised by
                ``_csv.Dialect``.
        """
        try:
            _Dialect(self)
        except TypeError as e:
            # Historically (py2.3 compatibility) a bad dialect is reported
            # as csv.Error.  ``from None`` keeps the internal TypeError out
            # of the traceback so it points at the caller's code.
            raise Error(str(e)) from None
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
# Make the dialect available by name, e.g. reader(f, dialect="excel").
register_dialect("excel", excel)
63*cda5da8dSAndroid Build Coastguard Worker
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
# Make the dialect available by name, e.g. reader(f, dialect="excel-tab").
register_dialect("excel-tab", excel_tab)
68*cda5da8dSAndroid Build Coastguard Worker
class unix_dialect(Dialect):
    """Describe the usual properties of Unix-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\n'
    quoting = QUOTE_ALL
# Make the dialect available by name, e.g. writer(f, dialect="unix").
register_dialect("unix", unix_dialect)
78*cda5da8dSAndroid Build Coastguard Worker
79*cda5da8dSAndroid Build Coastguard Worker
class DictReader:
    """Iterate over CSV rows, yielding each one as a dict keyed by field name.

    Wraps a ``reader`` built from *f*.  When *fieldnames* is omitted the
    first row read from the underlying reader supplies the keys.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        """Create the underlying reader and remember the mapping options.

        Args:
            f: passed straight to ``reader``.
            fieldnames: keys for the produced dicts, or None to take them
                from the first row.
            restkey: key under which surplus values of a long row are
                stored (as a list).
            restval: value used for the missing fields of a short row.
            dialect, *args, **kwds: forwarded to ``reader``.
        """
        # The names are traversed several times per row (zip, len, slice),
        # so a one-shot iterator has to be materialised up front.
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            fieldnames = list(fieldnames)
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        """Return self; the reader is its own iterator."""
        return self

    @property
    def fieldnames(self):
        """Field names, read lazily from the first row when not supplied.

        Stays None if no names were given and the input is exhausted.
        Accessing the property also syncs ``line_num`` with the
        underlying reader.
        """
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                # Empty input: there is no header row to take names from.
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def __next__(self):
        """Return the next non-blank row as a dict.

        Raises:
            StopIteration: when the underlying reader is exhausted.
        """
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        try:
            row = next(self.reader)
            # unlike the basic reader, we prefer not to return blanks,
            # because we will typically wind up with a dict full of None
            # values
            while row == []:
                row = next(self.reader)
        finally:
            # Keep line_num accurate even when blank rows were skipped or
            # the reader raised part-way through.
            self.line_num = self.reader.line_num

        fieldnames = self.fieldnames
        d = dict(zip(fieldnames, row))
        lf = len(fieldnames)
        lr = len(row)
        if lf < lr:
            # Long row: collect the surplus values under restkey.
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # Short row: pad the missing fields with restval.
            for key in fieldnames[lr:]:
                d[key] = self.restval
        return d
128*cda5da8dSAndroid Build Coastguard Worker
129*cda5da8dSAndroid Build Coastguard Worker
class DictWriter:
    """Write dicts as CSV rows, ordering the values by *fieldnames*.

    Wraps a ``writer`` built from *f*.
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        """Create the underlying writer and remember the mapping options.

        Args:
            f: passed straight to ``writer``.
            fieldnames: keys, in output column order.
            restval: value written for keys missing from a row dict.
            extrasaction: "raise" to reject row dicts containing keys not
                in *fieldnames*, "ignore" to drop them silently.  Matched
                case-insensitively.
            dialect, *args, **kwds: forwarded to ``writer``.

        Raises:
            ValueError: if *extrasaction* is neither "raise" nor "ignore".
        """
        # The names are traversed once per row, so a one-shot iterator
        # has to be materialised up front.
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            fieldnames = list(fieldnames)
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        normalized = extrasaction.lower()
        if normalized not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        # Store the normalised form: _dict_to_list compares against the
        # lowercase literal, so keeping e.g. "RAISE" would silently turn
        # the extra-key check off.
        self.extrasaction = normalized
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a row containing the field names themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        return self.writerow(header)

    def _dict_to_list(self, rowdict):
        """Return the values of *rowdict* in field order (as a generator).

        Raises:
            ValueError: if extrasaction is "raise" and *rowdict* has keys
                that are not in the field names.
        """
        if self.extrasaction == "raise":
            wrong_fields = rowdict.keys() - self.fieldnames
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return (rowdict.get(key, self.restval) for key in self.fieldnames)

    def writerow(self, rowdict):
        """Write one dict as a row; returns what the writer's writerow returns."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write every dict in *rowdicts* as a row."""
        return self.writer.writerows(map(self._dict_to_list, rowdicts))
158*cda5da8dSAndroid Build Coastguard Worker
# Guard Sniffer's type checking against builds that exclude complex():
# when the builtin is missing, fall back to float for the numeric test.
try:
    complex
except NameError:
    complex = float
164*cda5da8dSAndroid Build Coastguard Worker
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Return a Dialect subclass describing the format of the sample.

        The quote-based heuristic is tried first; if it yields no
        delimiter the frequency-based heuristic is used.  When
        `delimiters` is given, only characters it contains are accepted
        as the delimiter.

        Raises Error if no delimiter can be determined.
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns a tuple (quotechar, doublequote, delimiter,
        skipinitialspace); ('', False, None, 0) when nothing quoted is
        found.
        """

        matches = []
        # Patterns are tried from most to least specific; the first one
        # that matches anything decides which groups are available.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        groupindex = regexp.groupindex
        for m in matches:
            # findall() yields tuples indexed from 0, groupindex counts
            # groups from 1.  (With the single-group pattern m is a
            # one-character string, so m[0] is still the quote.)
            n = groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # the pattern that matched has no delimiter group
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = max(quotes, key=quotes.get)

        if delims:
            delim = max(delims, key=delims.get)
            # every occurrence of the winning delimiter was followed by
            # a space
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)

        doublequote = bool(dq_regexp.search(data))

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.

        Returns a tuple (delimiter, skipinitialspace); ('', 0) when no
        candidate is found.
        """

        data = list(filter(None, data.split('\n')))

        ascii_chars = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, chunkLength
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii_chars:
                    metaFrequency = charFrequency.setdefault(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1

            for char, metaFrequency in charFrequency.items():
                items = list(metaFrequency.items())
                if len(items) == 1 and items[0][0] == 0:
                    # character never occurs at all
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    mode = max(items, key=lambda x: x[1])
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(mode)
                    modes[char] = (mode[0],
                                   mode[1] - sum(item[1] for item in items))
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            total = min(chunkLength * iteration, len(data))
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modes.items():
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = next(iter(delims))
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims:
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        delim = max((v, k) for k, v in delims.items())[1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        """
        Return True if the first row of the sample looks like a header.

        The sample is parsed with the dialect returned by sniff(), so
        Error is raised when no delimiter can be determined.
        """
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        # column index -> `complex` (numeric column), an int (common
        # string length) or None (nothing seen yet)
        columnTypes = dict.fromkeys(range(columns))

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # iterate over a snapshot: columns may be deleted below
            for col in list(columnTypes):
                thisType = complex
                try:
                    thisType(row[col])
                except (ValueError, OverflowError):
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if isinstance(colType, int): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
445