"""Text wrapping and filling.
"""

# Copyright (C) 1999-2001 Gregory P. Ward.
# Copyright (C) 2002, 2003 Python Software Foundation.
# Written by Greg Ward <[email protected]>

import re

__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']

# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that
# some Unicode spaces (like \u00a0) are non-breaking whitespaces.
_whitespace = '\t\n\x0b\x0c\r '


class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 0 .. 'tabsize' spaces, depending on its position
        in its line.  If false, each tab is treated as a single character.
      tabsize (default: 8)
        Expand tabs in input text to 0 .. 'tabsize' spaces, unless
        'expand_tabs' is false.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
      break_on_hyphens (default: true)
        Allow breaking hyphenated words.  If true, wrapping will occur
        preferably on whitespaces and right after hyphens part of
        compound words.
      drop_whitespace (default: true)
        Drop leading and trailing whitespace from lines.
      max_lines (default: None)
        Truncate wrapped lines.
      placeholder (default: ' [...]')
        Append to the last line of truncated text.
    """

    # Translation table mapping every recognized whitespace character
    # to a plain space (used by _munge_whitespace()).
    unicode_whitespace_trans = dict.fromkeys(map(ord, _whitespace), ord(' '))

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    word_punct = r'[\w!"\'&.,?]'
    letter = r'[^\d\W]'
    whitespace = r'[%s]' % re.escape(_whitespace)
    nowhitespace = '[^' + whitespace[1:]
    wordsep_re = re.compile(r'''
        ( # any whitespace
          %(ws)s+
        | # em-dash between words
          (?<=%(wp)s) -{2,} (?=\w)
        | # word, possibly hyphenated
          %(nws)s+? (?:
            # hyphenated word
              -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
              (?= %(lt)s -? %(lt)s)
            | # end of word
              (?=%(ws)s|\Z)
            | # em-dash
              (?<=%(wp)s) (?=-{2,}\w)
            )
        )''' % {'wp': word_punct, 'lt': letter,
                'ws': whitespace, 'nws': nowhitespace},
        re.VERBOSE)
    del word_punct, letter, nowhitespace

    # This less funky little regex just splits on recognized spaces. E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
    wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
    del whitespace

    # XXX this is not locale- or charset-aware -- string.lowercase
    # is US-ASCII only (and therefore English-only)
    sentence_end_re = re.compile(r'[a-z]'             # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z')               # end of chunk

    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True,
                 tabsize=8,
                 *,
                 max_lines=None,
                 placeholder=' [...]'):
        """Store the wrapping options as instance attributes.

        Each argument is saved unchanged under the attribute of the
        same name; see the class docstring for what each one controls.
        No validation happens here -- 'width' and 'placeholder' are
        only checked when _wrap_chunks() runs.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens
        self.tabsize = tabsize
        self.max_lines = max_lines
        self.placeholder = placeholder

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz".
        """
        if self.expand_tabs:
            text = text.expandtabs(self.tabsize)
        if self.replace_whitespace:
            text = text.translate(self.unicode_whitespace_trans)
        return text

    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        if break_on_hyphens is true, or in:
          'Look,', ' ', 'goof-ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        otherwise.
        """
        # Test truthiness (not identity with True) so that this agrees
        # with _handle_long_word(), which also treats any truthy
        # break_on_hyphens value as "enabled".
        if self.break_on_hyphens:
            chunks = self.wordsep_re.split(text)
        else:
            chunks = self.wordsep_simple_re.split(text)
        # re.split() with a capturing group yields empty strings between
        # adjacent matches; they carry no text, so drop them.
        return [c for c in chunks if c]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.

        The list is modified in place; nothing is returned.
        """
        i = 0
        patsearch = self.sentence_end_re.search
        while i < len(chunks)-1:
            if chunks[i+1] == " " and patsearch(chunks[i]):
                chunks[i+1] = "  "
                # Skip the whitespace chunk we just rewrote.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.

        'reversed_chunks' is the stack of remaining chunks (next chunk
        last); both it and 'cur_line' are modified in place.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            end = space_left
            chunk = reversed_chunks[-1]
            if self.break_on_hyphens and len(chunk) > space_left:
                # break after last hyphen, but only if there are
                # non-hyphens before it
                hyphen = chunk.rfind('-', 0, space_left)
                if hyphen > 0 and any(c != '-' for c in chunk[:hyphen]):
                    end = hyphen + 1
            cur_line.append(chunk[:end])
            reversed_chunks[-1] = chunk[end:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        'chunks' is reversed and consumed in place.  Raises ValueError
        if 'self.width' is not positive, or if 'max_lines' is set and
        the placeholder plus the relevant indent cannot fit in 'width'.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)
        if self.max_lines is not None:
            # The placeholder lands on the last permitted line, which is
            # the first line only when max_lines is 1 (or less).
            if self.max_lines > 1:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent
            if len(indent) + len(self.placeholder.lstrip()) > self.width:
                raise ValueError("placeholder too large for max width")

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk_len = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_len <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_len

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)
                cur_len = sum(map(len, cur_line))

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                cur_len -= len(cur_line[-1])
                del cur_line[-1]

            if cur_line:
                if (self.max_lines is None or
                    len(lines) + 1 < self.max_lines or
                    (not chunks or
                     self.drop_whitespace and
                     len(chunks) == 1 and
                     not chunks[0].strip()) and cur_len <= width):
                    # Convert current line back to a string and store it in
                    # list of all lines (return value).
                    lines.append(indent + ''.join(cur_line))
                else:
                    # This is the last permitted line but text remains:
                    # strip chunks off the end until the placeholder fits
                    # after a non-whitespace chunk.
                    while cur_line:
                        if (cur_line[-1].strip() and
                            cur_len + len(self.placeholder) <= width):
                            cur_line.append(self.placeholder)
                            lines.append(indent + ''.join(cur_line))
                            break
                        cur_len -= len(cur_line[-1])
                        del cur_line[-1]
                    else:
                        # Nothing on this line can precede the placeholder:
                        # try to attach it to the previous line, otherwise
                        # emit it on a line of its own.
                        if lines:
                            prev_line = lines[-1].rstrip()
                            if (len(prev_line) + len(self.placeholder) <=
                                    self.width):
                                lines[-1] = prev_line + self.placeholder
                                break
                        lines.append(indent + self.placeholder.lstrip())
                    break

        return lines

    def _split_chunks(self, text):
        """_split_chunks(text : string) -> [string]

        Normalize whitespace with _munge_whitespace() and then split
        the result into chunks with _split().
        """
        text = self._munge_whitespace(text)
        return self._split(text)

    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        chunks = self._split_chunks(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))


# -- Convenience interface ---------------------------------------------

def wrap(text, width=70, **kwargs):
    """Wrap a single paragraph of text, returning a list of wrapped lines.

    Reformat the single paragraph in 'text' so it fits in lines of no
    more than 'width' columns, and return a list of wrapped lines.  By
    default, tabs in 'text' are expanded with string.expandtabs(), and
    all other whitespace characters (including newline) are converted to
    space.  See TextWrapper class for available keyword args to customize
    wrapping behaviour.
    """
    w = TextWrapper(width=width, **kwargs)
    return w.wrap(text)


def fill(text, width=70, **kwargs):
    """Fill a single paragraph of text, returning a new string.

    Reformat the single paragraph in 'text' to fit in lines of no more
    than 'width' columns, and return a new string containing the entire
    wrapped paragraph.  As with wrap(), tabs are expanded and other
    whitespace characters converted to space.  See TextWrapper class for
    available keyword args to customize wrapping behaviour.
    """
    w = TextWrapper(width=width, **kwargs)
    return w.fill(text)


def shorten(text, width, **kwargs):
    """Collapse and truncate the given text to fit in the given width.

    The text first has its whitespace collapsed.  If it then fits in
    the *width*, it is returned as is.  Otherwise, as many words
    as possible are joined and then the placeholder is appended::

        >>> textwrap.shorten("Hello  world!", width=12)
        'Hello world!'
        >>> textwrap.shorten("Hello  world!", width=11)
        'Hello [...]'
    """
    w = TextWrapper(width=width, max_lines=1, **kwargs)
    return w.fill(' '.join(text.strip().split()))


# -- Loosely related functionality -------------------------------------

_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)


def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.

    Entirely blank lines are normalized to a newline character.
    """
    # Look for the longest leading string of spaces and tabs common to
    # all lines.
    margin = None
    text = _whitespace_only_re.sub('', text)
    indents = _leading_whitespace_re.findall(text)
    for indent in indents:
        if margin is None:
            margin = indent

        # Current line more deeply indented than previous winner:
        # no change (previous winner is still on top).
        elif indent.startswith(margin):
            pass

        # Current line consistent with and no deeper than previous winner:
        # it's the new winner.
        elif margin.startswith(indent):
            margin = indent

        # Find the largest common whitespace between current line and previous
        # winner.
        else:
            for i, (x, y) in enumerate(zip(margin, indent)):
                if x != y:
                    margin = margin[:i]
                    break

    if margin:
        # 'margin' consists solely of spaces and tabs (see
        # _leading_whitespace_re), so it is safe to embed in a pattern.
        text = re.sub(r'(?m)^' + margin, '', text)
    return text


def indent(text, prefix, predicate=None):
    """Adds 'prefix' to the beginning of selected lines in 'text'.

    If 'predicate' is provided, 'prefix' will only be added to the lines
    where 'predicate(line)' is True. If 'predicate' is not provided,
    it will default to adding 'prefix' to all non-empty lines that do not
    consist solely of whitespace characters.
    """
    if predicate is None:
        def predicate(line):
            return line.strip()

    def prefixed_lines():
        # splitlines(True) keeps the line endings, so joining the
        # pieces back together reproduces the original line breaks.
        for line in text.splitlines(True):
            yield (prefix + line if predicate(line) else line)
    return ''.join(prefixed_lines())


if __name__ == "__main__":
    print(dedent("Hello there.\n  This is indented."))