# Copyright 2021 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Inclusive language presubmit check."""

import dataclasses
from pathlib import Path
import re

from . import presubmit, presubmit_context

# List borrowed from Android:
# https://source.android.com/setup/contribute/respectful-code
# inclusive-language: disable
NON_INCLUSIVE_WORDS = [
    r'master',
    r'slave',
    r'red[-\s]?line',
    r'(white|gr[ae]y|black)[-\s]*(list|hat)',
    r'craz(y|ie)',
    r'insane',
    r'crip+led?',
    r'sanity',
    r'sane',
    r'dummy',
    r'grandfather',
    r's?he',
    r'his',
    r'her',
    r'm[ae]n[-\s]*in[-\s]*the[-\s]*middle',
    r'mitm',
    r'first[-\s]?class[-\s]?citizen',
]
# inclusive-language: enable

# Test: master  # inclusive-language: ignore
# Test: master


def _process_inclusive_language(*words):
    """Turn a word list into one big regex that covers common inflections."""

    if not words:
        words = tuple(NON_INCLUSIVE_WORDS)

    # Flatten the arguments: accept both individual strings and lists/tuples
    # of strings.
    all_words = []
    for entry in words:
        if isinstance(entry, str):
            all_words.append(entry)
        elif isinstance(entry, (list, tuple)):
            all_words.extend(entry)
    all_words = tuple(all_words)

    # Confirm each individual word compiles as a valid regex.
    for word in all_words:
        _ = re.compile(word)

    # Treat '_', lowercase-to-uppercase (camelCase), and letter/digit
    # transitions as word boundaries in addition to the usual \b.
    word_boundary = (
        r'(\b|_|(?<=[a-z])(?=[A-Z])|(?<=[0-9])(?=\w)|(?<=\w)(?=[0-9]))'
    )

    # Match any of the words case-insensitively, optionally followed by a
    # simple inflection ('s', 'd', 'es', 'ed') before the closing boundary.
    return re.compile(
        r"({b})(?i:{w})(e?[sd]{b}|{b})".format(
            w='|'.join(all_words), b=word_boundary
        ),
    )


NON_INCLUSIVE_WORDS_REGEX = _process_inclusive_language()

# If seen, ignore this line and the next.
IGNORE = 'inclusive-language: ignore'

# Ignore a whole section. Please do not change the order of these lines.
DISABLE = 'inclusive-language: disable'
ENABLE = 'inclusive-language: enable'


@dataclasses.dataclass
class PathMatch:
    word: str

    def __repr__(self):
        return f'Found non-inclusive word "{self.word}" in file path'


@dataclasses.dataclass
class LineMatch:
    line: int
    word: str

    def __repr__(self):
        return f'Found non-inclusive word "{self.word}" on line {self.line}'


def check_file(
    path: Path,
    found_words: dict[Path, list[PathMatch | LineMatch]],
    words_regex: re.Pattern = NON_INCLUSIVE_WORDS_REGEX,
    check_path: bool = True,
    root: Path | None = None,
):
    """Check one file for non-inclusive language.

    Args:
        path: File to check.
        found_words: Output. Data structure where found words are added.
        words_regex: Pattern of non-inclusive terms.
        check_path: Whether to check the path in addition to the contents.
            (Used for testing.)
        root: Optional prefix prepended to path before the file is opened.
123 """ 124 if check_path: 125 match = words_regex.search(str(path)) 126 if match: 127 found_words.setdefault(path, []) 128 found_words[path].append(PathMatch(match.group(0))) 129 130 if path.is_symlink() or path.is_dir(): 131 return 132 133 try: 134 if root: 135 path = root / path 136 137 with open(path, 'r') as ins: 138 enabled = True 139 prev = '' 140 for i, line in enumerate(ins, start=1): 141 if DISABLE in line: 142 enabled = False 143 if ENABLE in line: 144 enabled = True 145 146 # If we see the ignore line on this or the previous line we 147 # ignore any bad words on this line. 148 ignored = IGNORE in prev or IGNORE in line 149 150 if enabled and not ignored: 151 match = words_regex.search(line) 152 153 if match: 154 found_words.setdefault(path, []) 155 found_words[path].append(LineMatch(i, match.group(0))) 156 157 # Not using 'continue' so this line always executes. 158 prev = line 159 160 except UnicodeDecodeError: 161 # File is not text, like a gif. 162 pass 163 164 165@presubmit.check(name='inclusive_language') 166def presubmit_check( 167 ctx: presubmit_context.PresubmitContext, 168 words_regex=NON_INCLUSIVE_WORDS_REGEX, 169): 170 """Presubmit check that ensures files do not contain banned words.""" 171 172 # No subprocesses are run for inclusive_language so don't perform this check 173 # if dry_run is on. 174 if ctx.dry_run: 175 return 176 177 found_words: dict[Path, list[PathMatch | LineMatch]] = {} 178 179 ctx.paths = presubmit_context.apply_exclusions(ctx) 180 181 for path in ctx.paths: 182 check_file( 183 path.relative_to(ctx.root), 184 found_words, 185 words_regex, 186 root=ctx.root, 187 ) 188 189 if found_words: 190 with open(ctx.failure_summary_log, 'w') as outs: 191 for i, (path, matches) in enumerate(found_words.items()): 192 if i: 193 print('=' * 40, file=outs) 194 print(path, file=outs) 195 for match in matches: 196 print(match, file=outs) 197 198 print(ctx.failure_summary_log.read_text(), end=None) 199 200 print() 201 print( 202 """ 203Individual lines can be ignored with "inclusive-language: ignore". Blocks can be 204ignored with "inclusive-language: disable" and reenabled with 205"inclusive-language: enable". 206""".strip() 207 ) 208 # Re-enable just in case: inclusive-language: enable. 209 210 raise presubmit_context.PresubmitFailure 211 212 213def inclusive_language_checker(*words): 214 """Create banned words checker for the given list of banned words.""" 215 216 regex = _process_inclusive_language(*words) 217 218 def inclusive_language( # pylint: disable=redefined-outer-name 219 ctx: presubmit_context.PresubmitContext, 220 ): 221 globals()['inclusive_language'](ctx, regex) 222 223 return inclusive_language 224