xref: /aosp_15_r20/external/toolchain-utils/llvm_tools/git_llvm_rev.py (revision 760c253c1ed00ce9abd48f8546f08516e57485fe)
1#!/usr/bin/env python3
2# Copyright 2019 The ChromiumOS Authors
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Maps LLVM git SHAs to synthetic revision numbers and back.
7
8Revision numbers are all of the form '(branch_name, r1234)'. As a shorthand,
9r1234 is parsed as '(main, 1234)'.
10"""
11
12import argparse
13from pathlib import Path
14import re
15import subprocess
16import sys
17from typing import IO, Iterable, List, NamedTuple, Optional, Tuple, Union
18
19
20MAIN_BRANCH = "main"
21
22# Note that after base_llvm_sha, we reach The Wild West(TM) of commits.
23# So reasonable input that could break us includes:
24#
25#   Revert foo
26#
27#   This reverts foo, which had the commit message:
28#
29#   bar
30#   llvm-svn: 375505
31#
32# While saddening, this is something we should probably try to handle
33# reasonably.
34base_llvm_revision = 375505
35base_llvm_sha = "186155b89c2d2a2f62337081e3ca15f676c9434b"
36
37# Known pairs of [revision, SHA] in ascending order.
38# The first element is the first non-`llvm-svn` commit that exists. Later ones
39# are functional nops, but speed this script up immensely, since `git` can take
40# quite a while to walk >100K commits.
41known_llvm_rev_sha_pairs = (
42    (base_llvm_revision, base_llvm_sha),
43    (425000, "af870e11aed7a5c475ae41a72e3015c4c88597d1"),
44    (450000, "906ebd5830e6053b50c52bf098e3586b567e8499"),
45    (475000, "530d14a99611a71f8f3eb811920fd7b5c4d4e1f8"),
46    (500000, "173855f9b0bdfe45d71272596b510650bfc1ca33"),
47)
48
49# Represents an LLVM git checkout:
50#  - |dir| is the directory of the LLVM checkout
51#  - |remote| is the name of the LLVM remote. Generally it's "origin".
52LLVMConfig = NamedTuple(
53    "LLVMConfig", (("remote", str), ("dir", Union[Path, str]))
54)
55
56
57class Rev(NamedTuple("Rev", (("branch", str), ("number", int)))):
58    """Represents a LLVM 'revision', a shorthand identifies a LLVM commit."""
59
60    @staticmethod
61    def parse(rev: str) -> "Rev":
62        """Parses a Rev from the given string.
63
64        Raises a ValueError on a failed parse.
65        """
66        # Revs are parsed into (${branch_name}, r${commits_since_base_commit})
67        # pairs.
68        #
69        # We support r${commits_since_base_commit} as shorthand for
70        # (main, r${commits_since_base_commit}).
71        if rev.startswith("r"):
72            branch_name = MAIN_BRANCH
73            rev_string = rev[1:]
74        else:
75            match = re.match(r"\((.+), r(\d+)\)", rev)
76            if not match:
77                raise ValueError("%r isn't a valid revision" % rev)
78
79            branch_name, rev_string = match.groups()
80
81        return Rev(branch=branch_name, number=int(rev_string))
82
83    def __str__(self) -> str:
84        branch_name, number = self
85        if branch_name == MAIN_BRANCH:
86            return "r%d" % number
87        return "(%s, r%d)" % (branch_name, number)
88
89
90def is_git_sha(xs: str) -> bool:
91    """Returns whether the given string looks like a valid git commit SHA."""
92    return (
93        len(xs) > 6
94        and len(xs) <= 40
95        and all(x.isdigit() or "a" <= x.lower() <= "f" for x in xs)
96    )
97
98
99def check_output(command: List[str], cwd: Union[Path, str]) -> str:
100    """Shorthand for subprocess.check_output. Auto-decodes any stdout."""
101    result = subprocess.run(
102        command,
103        cwd=cwd,
104        check=True,
105        stdin=subprocess.DEVNULL,
106        stdout=subprocess.PIPE,
107        encoding="utf-8",
108    )
109    return result.stdout
110
111
112def translate_prebase_sha_to_rev_number(
113    llvm_config: LLVMConfig, sha: str
114) -> int:
115    """Translates a sha to a revision number (e.g., "llvm-svn: 1234").
116
117    This function assumes that the given SHA is an ancestor of |base_llvm_sha|.
118    """
119    commit_message = check_output(
120        ["git", "log", "-n1", "--format=%B", sha, "--"],
121        cwd=llvm_config.dir,
122    )
123    last_line = commit_message.strip().splitlines()[-1]
124    svn_match = re.match(r"^llvm-svn: (\d+)$", last_line)
125
126    if not svn_match:
127        raise ValueError(
128            f"No llvm-svn line found for {sha}, which... shouldn't happen?"
129        )
130
131    return int(svn_match.group(1))
132
133
134def translate_sha_to_rev(llvm_config: LLVMConfig, sha_or_ref: str) -> Rev:
135    """Translates a sha or git ref to a Rev."""
136
137    if is_git_sha(sha_or_ref):
138        sha = sha_or_ref
139    else:
140        sha = check_output(
141            ["git", "rev-parse", "--revs-only", sha_or_ref, "--"],
142            cwd=llvm_config.dir,
143        )
144        sha = sha.strip()
145
146    for base_rev, base_sha in reversed(known_llvm_rev_sha_pairs):
147        merge_base = check_output(
148            ["git", "merge-base", base_sha, sha, "--"],
149            cwd=llvm_config.dir,
150        )
151        merge_base = merge_base.strip()
152        if merge_base == base_sha:
153            result = check_output(
154                [
155                    "git",
156                    "rev-list",
157                    "--count",
158                    "--first-parent",
159                    f"{base_sha}..{sha}",
160                    "--",
161                ],
162                cwd=llvm_config.dir,
163            )
164            count = int(result.strip())
165            return Rev(branch=MAIN_BRANCH, number=count + base_rev)
166
167    # Otherwise, either:
168    # - |merge_base| is |sha| (we have a guaranteed llvm-svn number on |sha|)
169    # - |merge_base| is neither (we have a guaranteed llvm-svn number on
170    #                            |merge_base|, but not |sha|)
171    merge_base_number = translate_prebase_sha_to_rev_number(
172        llvm_config, merge_base
173    )
174    if merge_base == sha:
175        return Rev(branch=MAIN_BRANCH, number=merge_base_number)
176
177    distance_from_base = check_output(
178        [
179            "git",
180            "rev-list",
181            "--count",
182            "--first-parent",
183            f"{merge_base}..{sha}",
184            "--",
185        ],
186        cwd=llvm_config.dir,
187    )
188
189    revision_number = merge_base_number + int(distance_from_base.strip())
190    branches_containing = check_output(
191        ["git", "branch", "-r", "--contains", sha],
192        cwd=llvm_config.dir,
193    )
194
195    candidates = []
196
197    prefix = llvm_config.remote + "/"
198    for branch in branches_containing.splitlines():
199        branch = branch.strip()
200        if branch.startswith(prefix):
201            candidates.append(branch[len(prefix) :])
202
203    if not candidates:
204        raise ValueError(
205            f"No viable branches found from {llvm_config.remote} with {sha}"
206        )
207
208    # It seems that some `origin/release/.*` branches have
209    # `origin/upstream/release/.*` equivalents, which is... awkward to deal
210    # with. Prefer the latter, since that seems to have newer commits than the
211    # former. Technically n^2, but len(elements) should be like, tens in the
212    # worst case.
213    candidates = [x for x in candidates if f"upstream/{x}" not in candidates]
214    if len(candidates) != 1:
215        raise ValueError(
216            f"Ambiguity: multiple branches from {llvm_config.remote} have "
217            f"{sha}: {sorted(candidates)}"
218        )
219
220    return Rev(branch=candidates[0], number=revision_number)
221
222
223def parse_git_commit_messages(
224    stream: Union[Iterable[str], IO[str]], separator: str
225) -> Iterable[Tuple[str, str]]:
226    """Parses a stream of git log messages.
227
228    These are expected to be in the format:
229
230    40 character sha
231    commit
232    message
233    body
234    separator
235    40 character sha
236    commit
237    message
238    body
239    separator
240    """
241
242    lines = iter(stream)
243    while True:
244        # Looks like a potential bug in pylint? crbug.com/1041148
245        # pylint: disable=stop-iteration-return
246        sha = next(lines, None)
247        if sha is None:
248            return
249
250        sha = sha.strip()
251        assert is_git_sha(sha), f"Invalid git SHA: {sha}"
252
253        message = []
254        for line in lines:
255            if line.strip() == separator:
256                break
257            message.append(line)
258
259        yield sha, "".join(message)
260
261
262def translate_prebase_rev_to_sha(llvm_config: LLVMConfig, rev: Rev) -> str:
263    """Translates a Rev to a SHA.
264
265    This function assumes that the given rev refers to a commit that's an
266    ancestor of |base_llvm_sha|.
267    """
268    # Because reverts may include reverted commit messages, we can't just |-n1|
269    # and pick that.
270    separator = ">!" * 80
271    looking_for = f"llvm-svn: {rev.number}"
272
273    git_command = [
274        "git",
275        "log",
276        "--grep",
277        f"^{looking_for}$",
278        f"--format=%H%n%B{separator}",
279        base_llvm_sha,
280    ]
281
282    with subprocess.Popen(
283        git_command,
284        cwd=llvm_config.dir,
285        stdin=subprocess.DEVNULL,
286        stdout=subprocess.PIPE,
287        encoding="utf-8",
288    ) as subp:
289        assert subp.stdout is not None
290        for sha, message in parse_git_commit_messages(subp.stdout, separator):
291            last_line = message.splitlines()[-1]
292            if last_line.strip() == looking_for:
293                subp.terminate()
294                return sha
295        if subp.wait() != 0:
296            raise subprocess.CalledProcessError(subp.returncode, git_command)
297
298    raise ValueError(f"No commit with revision {rev} found")
299
300
301def translate_rev_to_sha_from_baseline(
302    llvm_config: LLVMConfig,
303    parent_sha: str,
304    parent_rev: int,
305    child_sha: str,
306    child_rev: Optional[int],
307    want_rev: int,
308    branch_name: str,
309) -> str:
310    """Translates a revision number between a parent & child to a SHA.
311
312    Args:
313        llvm_config: LLVM config to use.
314        parent_sha: SHA of the parent that the revision number is a child of.
315        parent_rev: Revision number of `parent_sha`.
316        child_sha: A child of `parent_sha` to find a rev on.
317        child_rev: Optional note of what the child's revision number is.
318        want_rev: The desired revision number between child and parent.
319        branch_name: Name of the branch to refer to if a ValueError is raised.
320
321    Raises:
322        ValueError if the given child isn't far enough away from the parent to
323        find `want_rev`.
324    """
325    # As a convenience, have a fast path for want_rev < parent_rev. In
326    # particular, branches can hit this case.
327    if want_rev < parent_rev:
328        baseline_git_sha = parent_sha
329        commits_behind_baseline = parent_rev - want_rev
330    else:
331        if child_rev is None:
332            commits_between_parent_and_child = check_output(
333                [
334                    "git",
335                    "rev-list",
336                    "--count",
337                    "--first-parent",
338                    f"{parent_sha}..{child_sha}",
339                    "--",
340                ],
341                cwd=llvm_config.dir,
342            )
343            child_rev = parent_rev + int(
344                commits_between_parent_and_child.strip()
345            )
346        if child_rev < want_rev:
347            raise ValueError(
348                "Revision {want_rev} is past "
349                f"{llvm_config.remote}/{branch_name}. Try updating your tree?"
350            )
351        baseline_git_sha = child_sha
352        commits_behind_baseline = child_rev - want_rev
353
354    if not commits_behind_baseline:
355        return baseline_git_sha
356
357    result = check_output(
358        [
359            "git",
360            "rev-parse",
361            "--revs-only",
362            f"{baseline_git_sha}~{commits_behind_baseline}",
363        ],
364        cwd=llvm_config.dir,
365    )
366    return result.strip()
367
368
369def translate_rev_to_sha(llvm_config: LLVMConfig, rev: Rev) -> str:
370    """Translates a Rev to a SHA.
371
372    Raises a ValueError if the given Rev doesn't exist in the given config.
373    """
374    branch, number = rev
375
376    branch_tip = check_output(
377        ["git", "rev-parse", "--revs-only", f"{llvm_config.remote}/{branch}"],
378        cwd=llvm_config.dir,
379    ).strip()
380
381    if branch != MAIN_BRANCH:
382        main_merge_point = check_output(
383            [
384                "git",
385                "merge-base",
386                f"{llvm_config.remote}/{MAIN_BRANCH}",
387                branch_tip,
388            ],
389            cwd=llvm_config.dir,
390        )
391        main_merge_point = main_merge_point.strip()
392        main_rev = translate_sha_to_rev(llvm_config, main_merge_point)
393        return translate_rev_to_sha_from_baseline(
394            llvm_config,
395            parent_sha=main_merge_point,
396            parent_rev=main_rev.number,
397            child_sha=branch_tip,
398            child_rev=None,
399            want_rev=number,
400            branch_name=branch,
401        )
402
403    if number < base_llvm_revision:
404        return translate_prebase_rev_to_sha(llvm_config, rev)
405
406    # Technically this could be a binary search, but the list has fewer than 10
407    # elems, and won't grow fast. Linear is easier.
408    last_cached_rev = None
409    last_cached_sha = branch_tip
410    for cached_rev, cached_sha in reversed(known_llvm_rev_sha_pairs):
411        if cached_rev == number:
412            return cached_sha
413
414        if cached_rev < number:
415            return translate_rev_to_sha_from_baseline(
416                llvm_config,
417                parent_sha=cached_sha,
418                parent_rev=cached_rev,
419                child_sha=last_cached_sha,
420                child_rev=last_cached_rev,
421                want_rev=number,
422                branch_name=branch,
423            )
424
425        last_cached_rev = cached_rev
426        last_cached_sha = cached_sha
427
428    # This is only hit if `number >= base_llvm_revision` _and_ there's no
429    # coverage for `number` in `known_llvm_rev_sha_pairs`, which contains
430    # `base_llvm_revision`.
431    assert False, "Couldn't find a base SHA for a rev on main?"
432
433
434def find_root_llvm_dir(root_dir: str = ".") -> str:
435    """Finds the root of an LLVM directory starting at |root_dir|.
436
437    Raises a subprocess.CalledProcessError if no git directory is found.
438    """
439    result = check_output(
440        ["git", "rev-parse", "--show-toplevel"],
441        cwd=root_dir,
442    )
443    return result.strip()
444
445
446def main(argv: List[str]) -> None:
447    parser = argparse.ArgumentParser(description=__doc__)
448    parser.add_argument(
449        "--llvm_dir",
450        help="LLVM directory to consult for git history, etc. Autodetected "
451        "if cwd is inside of an LLVM tree",
452    )
453    parser.add_argument(
454        "--upstream",
455        default="origin",
456        help="LLVM upstream's remote name. Defaults to %(default)s.",
457    )
458    sha_or_rev = parser.add_mutually_exclusive_group(required=True)
459    sha_or_rev.add_argument(
460        "--sha", help="A git SHA (or ref) to convert to a rev"
461    )
462    sha_or_rev.add_argument("--rev", help="A rev to convert into a sha")
463    opts = parser.parse_args(argv)
464
465    llvm_dir = opts.llvm_dir
466    if llvm_dir is None:
467        try:
468            llvm_dir = find_root_llvm_dir()
469        except subprocess.CalledProcessError:
470            parser.error(
471                "Couldn't autodetect an LLVM tree; please use --llvm_dir"
472            )
473
474    config = LLVMConfig(
475        remote=opts.upstream,
476        dir=opts.llvm_dir or find_root_llvm_dir(),
477    )
478
479    if opts.sha:
480        rev = translate_sha_to_rev(config, opts.sha)
481        print(rev)
482    else:
483        sha = translate_rev_to_sha(config, Rev.parse(opts.rev))
484        print(sha)
485
486
487if __name__ == "__main__":
488    main(sys.argv[1:])
489