xref: /aosp_15_r20/external/pigweed/pw_presubmit/py/pw_presubmit/inclusive_language.py (revision 61c4878ac05f98d0ceed94b57d316916de578985)
1# Copyright 2021 The Pigweed Authors
2#
3# Licensed under the Apache License, Version 2.0 (the "License"); you may not
4# use this file except in compliance with the License. You may obtain a copy of
5# the License at
6#
7#     https://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12# License for the specific language governing permissions and limitations under
13# the License.
14"""Inclusive language presubmit check."""
15
16import dataclasses
17from pathlib import Path
18import re
19
20from . import presubmit, presubmit_context
21
# List borrowed from Android:
# https://source.android.com/setup/contribute/respectful-code
#
# Each entry is a regular-expression fragment, not a plain string.
# _process_inclusive_language() joins the entries into a single alternation
# that is matched case-insensitively and also accepts a trailing "s", "es",
# "d" or "ed".
#
# The marker comments around the list switch the check off for the list
# itself, presumably so that this file passes when it is scanned.
# inclusive-language: disable
NON_INCLUSIVE_WORDS = [
    r'master',
    r'slave',
    r'red[-\s]?line',
    r'(white|gr[ae]y|black)[-\s]*(list|hat)',
    r'craz(y|ie)',
    r'insane',
    r'crip+led?',
    r'sanity',
    r'sane',
    r'dummy',
    r'grandfather',
    r's?he',
    r'his',
    r'her',
    r'm[ae]n[-\s]*in[-\s]*the[-\s]*middle',
    r'mitm',
    r'first[-\s]?class[-\s]?citizen',
]
# inclusive-language: enable

# The two lines below appear to exercise the single-line marker: it is present
# on the first line only, and check_file() skips a line when the marker is on
# that line or on the line before it.
# Test: master  # inclusive-language: ignore
# Test: master
48
49
50def _process_inclusive_language(*words):
51    """Turn word list into one big regex with common inflections."""
52
53    if not words:
54        words = tuple(NON_INCLUSIVE_WORDS)
55
56    all_words = []
57    for entry in words:
58        if isinstance(entry, str):
59            all_words.append(entry)
60        elif isinstance(entry, (list, tuple)):
61            all_words.extend(entry)
62        all_words.extend(x for x in words)
63    all_words = tuple(all_words)
64
65    # Confirm each individual word compiles as a valid regex.
66    for word in all_words:
67        _ = re.compile(word)
68
69    word_boundary = (
70        r'(\b|_|(?<=[a-z])(?=[A-Z])|(?<=[0-9])(?=\w)|(?<=\w)(?=[0-9]))'
71    )
72
73    return re.compile(
74        r"({b})(?i:{w})(e?[sd]{b}|{b})".format(
75            w='|'.join(all_words), b=word_boundary
76        ),
77    )
78
79
# Default pattern, built from NON_INCLUSIVE_WORDS when this module is imported.
NON_INCLUSIVE_WORDS_REGEX = _process_inclusive_language()

# If seen, ignore this line and the next.
IGNORE = 'inclusive-language: ignore'

# Ignore a whole section. Please do not change the order of these lines.
# check_file() turns checking off at a line containing DISABLE and back on at
# a line containing ENABLE, so when this file is scanned the second assignment
# below switches checking back on right after the first one switched it off.
DISABLE = 'inclusive-language: disable'
ENABLE = 'inclusive-language: enable'
88
89
# repr=False: the hand-written __repr__ below is the one in effect either way.
@dataclasses.dataclass(repr=False)
class PathMatch:
    """Record of a non-inclusive term that was found in a file's path."""

    # The text that matched the pattern.
    word: str

    def __repr__(self) -> str:
        """Return the message shown to the user for this match."""
        return f'Found non-inclusive word "{self.word}" in file path'
96
97
# repr=False: the hand-written __repr__ below is the one in effect either way.
@dataclasses.dataclass(repr=False)
class LineMatch:
    """Record of a non-inclusive term that was found in a file's contents."""

    # Line number of the match; check_file() counts lines starting at 1.
    line: int
    # The text that matched the pattern.
    word: str

    def __repr__(self) -> str:
        """Return the message shown to the user for this match."""
        return f'Found non-inclusive word "{self.word}" on line {self.line}'
105
106
def check_file(
    path: Path,
    found_words: dict[Path, list[PathMatch | LineMatch]],
    words_regex: re.Pattern = NON_INCLUSIVE_WORDS_REGEX,
    check_path: bool = True,
    root: Path | None = None,
):
    """Check one file for non-inclusive language.

    Symlinks and directories only have their path checked. Files that cannot
    be decoded as text are skipped.

    Args:
        path: File to check. When root is given, the file that is opened is
            root / path.
        found_words: Output. Data structure where found words are added.
            A match in the path is stored under path as passed in; matches in
            the contents are stored under the root-prefixed path.
        words_regex: Pattern of non-inclusive terms.
        check_path: Whether to check the path in addition to the contents.
            (Used for testing.)
        root: Path to add as a prefix to path. The prefix itself is never
            searched for non-inclusive terms.
    """
    if check_path:
        match = words_regex.search(str(path))
        if match:
            found_words.setdefault(path, []).append(
                PathMatch(match.group(0))
            )

    # Join with root before touching the filesystem so the symlink/directory
    # test looks at the same file that is opened below, independent of the
    # current working directory.
    full_path = root / path if root else path

    if full_path.is_symlink() or full_path.is_dir():
        return

    try:
        with open(full_path, 'r') as ins:
            enabled = True
            prev = ''
            for i, line in enumerate(ins, start=1):
                # A line holding both markers ends up enabled, because the
                # ENABLE test runs last.
                if DISABLE in line:
                    enabled = False
                if ENABLE in line:
                    enabled = True

                # If we see the ignore line on this or the previous line we
                # ignore any bad words on this line.
                ignored = IGNORE in prev or IGNORE in line

                if enabled and not ignored:
                    # Only the first match on a line is reported.
                    match = words_regex.search(line)

                    if match:
                        found_words.setdefault(full_path, []).append(
                            LineMatch(i, match.group(0))
                        )

                # Not using 'continue' so this line always executes.
                prev = line

    except UnicodeDecodeError:
        # File is not text, like a gif.
        pass
163
164
@presubmit.check(name='inclusive_language')
def presubmit_check(
    ctx: presubmit_context.PresubmitContext,
    words_regex=NON_INCLUSIVE_WORDS_REGEX,
):
    """Fail the presubmit if any checked file contains a banned word.

    All matches are written to ctx.failure_summary_log and echoed to stdout
    together with a hint on how to suppress them.

    Args:
        ctx: Presubmit context. ctx.paths is replaced with the result of
            presubmit_context.apply_exclusions(ctx).
        words_regex: Pattern of non-inclusive terms.

    Raises:
        presubmit_context.PresubmitFailure: If at least one match was found.
    """

    # This check runs no subprocesses, so there is nothing to do in a dry run.
    if ctx.dry_run:
        return

    ctx.paths = presubmit_context.apply_exclusions(ctx)

    found_words: dict[Path, list[PathMatch | LineMatch]] = {}
    for path in ctx.paths:
        relative = path.relative_to(ctx.root)
        check_file(relative, found_words, words_regex, root=ctx.root)

    if not found_words:
        return

    # One section per file, with a separator line between sections.
    with open(ctx.failure_summary_log, 'w') as outs:
        needs_separator = False
        for path, matches in found_words.items():
            if needs_separator:
                print('=' * 40, file=outs)
            needs_separator = True
            print(path, file=outs)
            for match in matches:
                print(match, file=outs)

    print(ctx.failure_summary_log.read_text())
    print()

    help_text = """
Individual lines can be ignored with "inclusive-language: ignore". Blocks can be
ignored with "inclusive-language: disable" and reenabled with
"inclusive-language: enable".
""".strip()
    # Re-enable just in case: inclusive-language: enable.
    print(help_text)

    raise presubmit_context.PresubmitFailure
211
212
def inclusive_language_checker(*words):
    """Create banned words checker for the given list of banned words.

    Args:
        *words: Regex fragments, passed unchanged to
            _process_inclusive_language().

    Returns:
        A function that takes a PresubmitContext and runs presubmit_check
        with the pattern built from words.
    """

    regex = _process_inclusive_language(*words)

    def inclusive_language(ctx: presubmit_context.PresubmitContext):
        # Call the module-level check by its real name. This module defines no
        # global called 'inclusive_language', so a globals() lookup under that
        # key fails with KeyError.
        # NOTE(review): assumes the object produced by @presubmit.check can be
        # called as (ctx, words_regex) - confirm against pw_presubmit.
        presubmit_check(ctx, regex)

    return inclusive_language
224