xref: /aosp_15_r20/prebuilts/build-tools/common/py3-stdlib/textwrap.py (revision cda5da8d549138a6648c5ee6d7a49cf8f4a657be)
1"""Text wrapping and filling.
2"""
3
4# Copyright (C) 1999-2001 Gregory P. Ward.
5# Copyright (C) 2002, 2003 Python Software Foundation.
6# Written by Greg Ward <[email protected]>
7
8import re
9
10__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
11
# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that
# some Unicode spaces (like \u00a0) are non-breaking whitespaces.
_whitespace = '\t\n\x0b\x0c\r '

class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 0 .. 'tabsize' spaces, depending on its position
        in its line.  If false, each tab is treated as a single character.
      tabsize (default: 8)
        Expand tabs in input text to 0 .. 'tabsize' spaces, unless
        'expand_tabs' is false.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
      break_on_hyphens (default: true)
        Allow breaking hyphenated words. If true, wrapping will occur
        preferably on whitespaces and right after hyphens part of
        compound words.  Any truthy value enables this behaviour.
      drop_whitespace (default: true)
        Drop leading and trailing whitespace from lines.
      max_lines (default: None)
        Truncate wrapped lines.
      placeholder (default: ' [...]')
        Append to the last line of truncated text.
    """

    # Translation table for str.translate(): maps every recognized
    # whitespace character to a plain space.
    unicode_whitespace_trans = dict.fromkeys(map(ord, _whitespace), ord(' '))

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    word_punct = r'[\w!"\'&.,?]'
    letter = r'[^\d\W]'
    whitespace = r'[%s]' % re.escape(_whitespace)
    # Same character class as 'whitespace', negated.
    nowhitespace = '[^' + whitespace[1:]
    wordsep_re = re.compile(r'''
        ( # any whitespace
          %(ws)s+
        | # em-dash between words
          (?<=%(wp)s) -{2,} (?=\w)
        | # word, possibly hyphenated
          %(nws)s+? (?:
            # hyphenated word
              -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
              (?= %(lt)s -? %(lt)s)
            | # end of word
              (?=%(ws)s|\Z)
            | # em-dash
              (?<=%(wp)s) (?=-{2,}\w)
            )
        )''' % {'wp': word_punct, 'lt': letter,
                'ws': whitespace, 'nws': nowhitespace},
        re.VERBOSE)
    # The pattern fragments are only scaffolding; keep them out of the
    # class namespace.
    del word_punct, letter, nowhitespace

    # This less funky little regex just split on recognized spaces. E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
    wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
    del whitespace

    # XXX this is not locale- or charset-aware -- string.lowercase
    # is US-ASCII only (and therefore English-only)
    sentence_end_re = re.compile(r'[a-z]'             # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z')               # end of chunk

    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True,
                 tabsize=8,
                 *,
                 max_lines=None,
                 placeholder=' [...]'):
        """Store the wrapping options as instance attributes.

        No validation happens here; 'width', 'max_lines' and
        'placeholder' are checked when _wrap_chunks() runs.  See the
        class docstring for the meaning of each option.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens
        self.tabsize = tabsize
        self.max_lines = max_lines
        self.placeholder = placeholder


    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz".

        Each step only runs when the matching option ('expand_tabs',
        'replace_whitespace') is enabled.
        """
        if self.expand_tabs:
            text = text.expandtabs(self.tabsize)
        if self.replace_whitespace:
            text = text.translate(self.unicode_whitespace_trans)
        return text


    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        if break_on_hyphens is true, or in:
          'Look,', ' ', 'goof-ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        otherwise.
        """
        # Use plain truthiness so that this agrees with
        # _handle_long_word(), which also tests 'break_on_hyphens' by
        # truth value rather than by identity with True.
        if self.break_on_hyphens:
            chunks = self.wordsep_re.split(text)
        else:
            chunks = self.wordsep_simple_re.split(text)
        # re.split() with a capturing group yields empty strings between
        # adjacent matches; drop them.
        return [c for c in chunks if c]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.

        'chunks' is modified in place; nothing is returned.
        """
        i = 0
        patsearch = self.sentence_end_re.search
        while i < len(chunks)-1:
            if chunks[i+1] == " " and patsearch(chunks[i]):
                chunks[i+1] = "  "
                # Skip the whitespace chunk that was just widened.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.

        'reversed_chunks' is the stack of remaining chunks (next chunk
        last) and 'cur_line' the chunks already placed on the current
        line; both lists are modified in place.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            end = space_left
            chunk = reversed_chunks[-1]
            if self.break_on_hyphens and len(chunk) > space_left:
                # break after last hyphen, but only if there are
                # non-hyphens before it
                hyphen = chunk.rfind('-', 0, space_left)
                if hyphen > 0 and any(c != '-' for c in chunk[:hyphen]):
                    end = hyphen + 1
            cur_line.append(chunk[:end])
            reversed_chunks[-1] = chunk[end:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        The 'chunks' list is reversed and consumed in place.  Raises
        ValueError if 'self.width' is not positive, or if 'max_lines'
        is set and the placeholder cannot fit on a line.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)
        if self.max_lines is not None:
            # The placeholder lands on the last permitted line, which is
            # the first line only when a single line is allowed.
            if self.max_lines > 1:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent
            if len(indent) + len(self.placeholder.lstrip()) > self.width:
                raise ValueError("placeholder too large for max width")

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk_len = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_len <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_len

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)
                cur_len = sum(map(len, cur_line))

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                cur_len -= len(cur_line[-1])
                del cur_line[-1]

            if cur_line:
                # Emit the line as-is when no truncation applies: there
                # is no line limit, the limit is not reached yet, or the
                # remaining input is empty (or one droppable whitespace
                # chunk) and this line fits.
                if (self.max_lines is None or
                    len(lines) + 1 < self.max_lines or
                    (not chunks or
                     self.drop_whitespace and
                     len(chunks) == 1 and
                     not chunks[0].strip()) and cur_len <= width):
                    # Convert current line back to a string and store it in
                    # list of all lines (return value).
                    lines.append(indent + ''.join(cur_line))
                else:
                    # Truncation: drop trailing chunks until the
                    # placeholder fits after a non-whitespace chunk.
                    while cur_line:
                        if (cur_line[-1].strip() and
                            cur_len + len(self.placeholder) <= width):
                            cur_line.append(self.placeholder)
                            lines.append(indent + ''.join(cur_line))
                            break
                        cur_len -= len(cur_line[-1])
                        del cur_line[-1]
                    else:
                        # Nothing from this line could be kept: try to
                        # hang the placeholder on the previous line,
                        # otherwise give it a line of its own.
                        if lines:
                            prev_line = lines[-1].rstrip()
                            if (len(prev_line) + len(self.placeholder) <=
                                    self.width):
                                lines[-1] = prev_line + self.placeholder
                                break
                        lines.append(indent + self.placeholder.lstrip())
                    break

        return lines

    def _split_chunks(self, text):
        """_split_chunks(text : string) -> [string]

        Normalize whitespace with _munge_whitespace() and split the
        result into chunks with _split().
        """
        text = self._munge_whitespace(text)
        return self._split(text)

    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        chunks = self._split_chunks(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))
369
370
371# -- Convenience interface ---------------------------------------------
372
def wrap(text, width=70, **kwargs):
    """Wrap one paragraph of text and return the wrapped lines as a list.

    The paragraph in 'text' is reformatted so that every line is at
    most 'width' columns wide.  By default, tabs in 'text' are expanded
    with string.expandtabs() and every other whitespace character
    (newlines included) is turned into a space.  Any extra keyword
    arguments are forwarded to the TextWrapper constructor; see that
    class for the options available to customize wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)
385
def fill(text, width=70, **kwargs):
    """Fill one paragraph of text and return it as a single new string.

    The paragraph in 'text' is reformatted so that every line is at
    most 'width' columns wide, and the wrapped lines are returned
    joined into one string.  As with wrap(), tabs are expanded and
    other whitespace characters are converted to spaces.  Any extra
    keyword arguments are forwarded to the TextWrapper constructor; see
    that class for the options available to customize wrapping
    behaviour.
    """
    return TextWrapper(width=width, **kwargs).fill(text)
397
def shorten(text, width, **kwargs):
    """Collapse whitespace in the text and truncate it to fit the width.

    Runs of whitespace in *text* are first collapsed to single spaces.
    If the result fits in *width* it is returned unchanged.  Otherwise
    as many leading words as will fit are kept and the placeholder is
    appended::

        >>> textwrap.shorten("Hello  world!", width=12)
        'Hello world!'
        >>> textwrap.shorten("Hello  world!", width=11)
        'Hello [...]'
    """
    # Limiting the wrapper to a single line is what triggers truncation.
    wrapper = TextWrapper(width=width, max_lines=1, **kwargs)
    collapsed = ' '.join(text.strip().split())
    return wrapper.fill(collapsed)
412
413
414# -- Loosely related functionality -------------------------------------
415
# Matches a line that consists solely of spaces and/or tabs.
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
# Captures the leading run of spaces/tabs of every line that contains at
# least one character other than space, tab or newline.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)

def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.

    Entirely blank lines are normalized to a newline character.

    Returns the dedented text; if the lines share no common margin only
    the blank-line normalization is applied.
    """
    # Look for the longest leading string of spaces and tabs common to
    # all lines.
    margin = None
    # Blank out whitespace-only lines first so they neither influence
    # the margin nor keep their stray indentation.
    text = _whitespace_only_re.sub('', text)
    for line_indent in _leading_whitespace_re.findall(text):
        if margin is None:
            margin = line_indent

        # Current line more deeply indented than previous winner:
        # no change (previous winner is still on top).
        elif line_indent.startswith(margin):
            pass

        # Current line consistent with and no deeper than previous winner:
        # it's the new winner.
        elif margin.startswith(line_indent):
            margin = line_indent

        # Find the largest common whitespace between current line and previous
        # winner.
        else:
            for i, (x, y) in enumerate(zip(margin, line_indent)):
                if x != y:
                    margin = margin[:i]
                    break

    # 'margin' is None when no line has content and '' when the lines
    # share no leading whitespace; either way there is nothing to strip.
    if margin:
        # The margin holds only spaces and tabs, so it needs no escaping.
        text = re.sub(r'(?m)^' + margin, '', text)
    return text
468
469
def indent(text, prefix, predicate=None):
    """Prepend 'prefix' to selected lines of 'text' and return the result.

    When 'predicate' is given, a line receives the prefix only if
    'predicate(line)' is true.  When it is omitted, the prefix is added
    to every line that is non-empty and does not consist solely of
    whitespace characters.
    """
    if predicate is None:
        # Default rule: a line qualifies when stripping it leaves
        # something behind.
        wants_prefix = lambda line: line.strip()
    else:
        wants_prefix = predicate

    pieces = []
    # keepends=True preserves each line's own terminator, so joining the
    # pieces reproduces the original line structure exactly.
    for line in text.splitlines(True):
        if wants_prefix(line):
            pieces.append(prefix + line)
        else:
            pieces.append(line)
    return ''.join(pieces)
486
487
if __name__ == "__main__":
    # Tiny manual demo of dedent() when the module is run as a script.
    # Further examples, kept disabled:
    #print(dedent("\tfoo\n\tbar"))
    #print(dedent("  \thello there\n  \t  how are you?"))
    # The first line below has no indentation, so the lines share no
    # common margin and the text is printed unchanged.
    print(dedent("Hello there.\n  This is indented."))
492