xref: /aosp_15_r20/external/cronet/build/android/gyp/util/md5_check.py (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1# Copyright 2013 The Chromium Authors
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5
6import difflib
7import hashlib
8import itertools
9import json
10import os
11import sys
12import zipfile
13
14from util import build_utils
15import action_helpers  # build_utils adds //build to sys.path.
16import print_python_deps
17
# When set and a difference is detected, a diff of what changed is printed.
PRINT_EXPLANATIONS = int(os.environ.get('PRINT_BUILD_EXPLANATIONS', 0))

# An escape hatch that causes all targets to be rebuilt.
# Set FORCE_REBUILD=1 in the environment to ignore all recorded stamp data.
_FORCE_REBUILD = int(os.environ.get('FORCE_REBUILD', 0))
23
24
def CallAndWriteDepfileIfStale(on_stale_md5,
                               options,
                               record_path=None,
                               input_paths=None,
                               input_strings=None,
                               output_paths=None,
                               force=False,
                               pass_changes=False,
                               track_subpaths_allowlist=None,
                               depfile_deps=None):
  """Wraps CallAndRecordIfStale() and writes a depfile if applicable.

  Depfiles are automatically added to output_paths when present in the |options|
  argument. They are then created after |on_stale_md5| is called.

  By default, only python dependencies are added to the depfile. If there are
  other input paths that are not captured by GN deps, then they should be listed
  in depfile_deps. It's important to write paths to the depfile that are already
  captured by GN deps since GN args can cause GN deps to change, and such
  changes are not immediately reflected in depfiles (http://crbug.com/589311).
  """
  if not output_paths:
    raise Exception('At least one output_path must be specified.')
  # Copy the caller-supplied sequences so the additions below do not leak back.
  input_paths = list(input_paths) if input_paths else []
  input_strings = list(input_strings) if input_strings else []
  output_paths = list(output_paths) if output_paths else []

  # Changes to this script (or anything it imports) should also trigger a
  # re-run, so fold the python dependency closure into the inputs.
  input_paths.extend(print_python_deps.ComputePythonDependencies())

  CallAndRecordIfStale(on_stale_md5,
                       record_path=record_path,
                       input_paths=input_paths,
                       input_strings=input_strings,
                       output_paths=output_paths,
                       force=force,
                       pass_changes=pass_changes,
                       track_subpaths_allowlist=track_subpaths_allowlist)

  # Write depfile even when inputs have not changed to ensure build correctness
  # on bots that build with & without patch, and the patch changes the depfile
  # location.
  if getattr(options, 'depfile', None):
    action_helpers.write_depfile(options.depfile, output_paths[0], depfile_deps)
69
70
def CallAndRecordIfStale(function,
                         record_path=None,
                         input_paths=None,
                         input_strings=None,
                         output_paths=None,
                         force=False,
                         pass_changes=False,
                         track_subpaths_allowlist=None):
  """Calls function if outputs are stale.

  Outputs are considered stale if:
  - any output_paths are missing, or
  - the contents of any file within input_paths has changed, or
  - the contents of input_strings has changed.

  To debug which files are out-of-date, set the environment variable:
      PRINT_BUILD_EXPLANATIONS=1

  Args:
    function: The function to call.
    record_path: Path to record metadata.
      Defaults to output_paths[0] + '.md5.stamp'
    input_paths: List of paths to calculate an md5 sum on.
    input_strings: List of strings to record verbatim.
    output_paths: List of output paths.
    force: Whether to treat outputs as missing regardless of whether they
      actually are.
    pass_changes: Whether to pass a Changes instance to |function|.
    track_subpaths_allowlist: Relevant only when pass_changes=True. List of .zip
      files from |input_paths| to make subpath information available for.
  """
  assert record_path or output_paths
  input_paths = input_paths or []
  input_strings = input_strings or []
  output_paths = output_paths or []
  record_path = record_path or output_paths[0] + '.md5.stamp'

  assert record_path.endswith('.stamp'), (
      'record paths must end in \'.stamp\' so that they are easy to find '
      'and delete')

  # Per-entry tracking is only needed when someone will query the diff.
  new_metadata = _Metadata(track_entries=pass_changes or PRINT_EXPLANATIONS)
  new_metadata.AddStrings(input_strings)

  zip_allowlist = set(track_subpaths_allowlist or [])
  for path in input_paths:
    # It's faster to md5 an entire zip file than it is to just locate & hash
    # its central directory (which is what this used to do).
    if path in zip_allowlist:
      entries = _ExtractZipEntries(path)
      new_metadata.AddZipFile(path, entries)
    else:
      new_metadata.AddFile(path, _ComputeTagForPath(path))

  old_metadata = None
  force = force or _FORCE_REBUILD
  missing_outputs = [x for x in output_paths if force or not os.path.exists(x)]
  too_new = []
  # When outputs are missing, don't bother gathering change information.
  if not missing_outputs and os.path.exists(record_path):
    record_mtime = os.path.getmtime(record_path)
    # Outputs newer than the change information must have been modified outside
    # of the build, and should be considered stale.
    too_new = [x for x in output_paths if os.path.getmtime(x) > record_mtime]
    if not too_new:
      with open(record_path, 'r') as jsonfile:
        try:
          old_metadata = _Metadata.FromFile(jsonfile)
        except:  # pylint: disable=bare-except
          pass  # Not yet using new file format.

  changes = Changes(old_metadata, new_metadata, force, missing_outputs, too_new)
  if not changes.HasChanges():
    return

  if PRINT_EXPLANATIONS:
    print('=' * 80)
    print('Target is stale: %s' % record_path)
    print(changes.DescribeDifference())
    print('=' * 80)

  args = (changes,) if pass_changes else ()
  function(*args)

  # Write the stamp only after |function| succeeds so a failed build remains
  # stale on the next run.
  with open(record_path, 'w') as f:
    new_metadata.ToFile(f)
157
158
class Changes:
  """Provides an API for querying what changed between runs."""

  def __init__(self, old_metadata, new_metadata, force, missing_outputs,
               too_new):
    # old_metadata may be None when no previous stamp file was readable.
    self.old_metadata = old_metadata
    self.new_metadata = new_metadata
    self.force = force
    self.missing_outputs = missing_outputs
    self.too_new = too_new

  def _GetOldTag(self, path, subpath=None):
    # Returns None when there is no old metadata or the entry is unknown.
    return self.old_metadata and self.old_metadata.GetTag(path, subpath)

  def HasChanges(self):
    """Returns whether any changes exist."""
    # HasStringChanges() is True whenever old_metadata is None, so the
    # FilesMd5() comparison below never dereferences a None old_metadata.
    return (self.HasStringChanges()
            or self.old_metadata.FilesMd5() != self.new_metadata.FilesMd5())

  def HasStringChanges(self):
    """Returns whether string metadata changed."""
    return (self.force or not self.old_metadata
            or self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5())

  def AddedOrModifiedOnly(self):
    """Returns whether the only changes were from added or modified (sub)files.

    No missing outputs, no removed paths/subpaths.
    """
    if self.HasStringChanges():
      return False
    if any(self.IterRemovedPaths()):
      return False
    for path in self.IterModifiedPaths():
      if any(self.IterRemovedSubpaths(path)):
        return False
    return True

  def IterAllPaths(self):
    """Generator for paths."""
    return self.new_metadata.IterPaths()

  def IterAllSubpaths(self, path):
    """Generator for subpaths."""
    return self.new_metadata.IterSubpaths(path)

  def IterAddedPaths(self):
    """Generator for paths that were added."""
    for path in self.new_metadata.IterPaths():
      if self._GetOldTag(path) is None:
        yield path

  def IterAddedSubpaths(self, path):
    """Generator for paths that were added within the given zip file."""
    for subpath in self.new_metadata.IterSubpaths(path):
      if self._GetOldTag(path, subpath) is None:
        yield subpath

  def IterRemovedPaths(self):
    """Generator for paths that were removed."""
    if self.old_metadata:
      for path in self.old_metadata.IterPaths():
        if self.new_metadata.GetTag(path) is None:
          yield path

  def IterRemovedSubpaths(self, path):
    """Generator for paths that were removed within the given zip file."""
    if self.old_metadata:
      for subpath in self.old_metadata.IterSubpaths(path):
        if self.new_metadata.GetTag(path, subpath) is None:
          yield subpath

  def IterModifiedPaths(self):
    """Generator for paths whose contents have changed."""
    for path in self.new_metadata.IterPaths():
      old_tag = self._GetOldTag(path)
      new_tag = self.new_metadata.GetTag(path)
      if old_tag is not None and old_tag != new_tag:
        yield path

  def IterModifiedSubpaths(self, path):
    """Generator for paths within a zip file whose contents have changed."""
    for subpath in self.new_metadata.IterSubpaths(path):
      old_tag = self._GetOldTag(path, subpath)
      new_tag = self.new_metadata.GetTag(path, subpath)
      if old_tag is not None and old_tag != new_tag:
        yield subpath

  def IterChangedPaths(self):
    """Generator for all changed paths (added/removed/modified)."""
    return itertools.chain(self.IterRemovedPaths(),
                           self.IterModifiedPaths(),
                           self.IterAddedPaths())

  def IterChangedSubpaths(self, path):
    """Generator for paths within a zip that were added/removed/modified."""
    return itertools.chain(self.IterRemovedSubpaths(path),
                           self.IterModifiedSubpaths(path),
                           self.IterAddedSubpaths(path))

  def DescribeDifference(self):
    """Returns a human-readable description of what changed."""
    # Checks are ordered from cheapest/most-decisive to most detailed.
    if self.force:
      return 'force=True'
    if self.missing_outputs:
      return 'Outputs do not exist:\n  ' + '\n  '.join(self.missing_outputs)
    if self.too_new:
      return 'Outputs newer than stamp file:\n  ' + '\n  '.join(self.too_new)
    if self.old_metadata is None:
      return 'Previous stamp file not found.'

    if self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5():
      ndiff = difflib.ndiff(self.old_metadata.GetStrings(),
                            self.new_metadata.GetStrings())
      changed = [s for s in ndiff if not s.startswith(' ')]
      return 'Input strings changed:\n  ' + '\n  '.join(changed)

    if self.old_metadata.FilesMd5() == self.new_metadata.FilesMd5():
      return "There's no difference."

    lines = []
    lines.extend('Added: ' + p for p in self.IterAddedPaths())
    lines.extend('Removed: ' + p for p in self.IterRemovedPaths())
    for path in self.IterModifiedPaths():
      lines.append('Modified: ' + path)
      lines.extend('  -> Subpath added: ' + p
                   for p in self.IterAddedSubpaths(path))
      lines.extend('  -> Subpath removed: ' + p
                   for p in self.IterRemovedSubpaths(path))
      lines.extend('  -> Subpath modified: ' + p
                   for p in self.IterModifiedSubpaths(path))
    if lines:
      return 'Input files changed:\n  ' + '\n  '.join(lines)
    return 'I have no idea what changed (there is a bug).'
293
294
295class _Metadata:
296  """Data model for tracking change metadata.
297
298  Args:
299    track_entries: Enables per-file change tracking. Slower, but required for
300        Changes functionality.
301  """
302  # Schema:
303  # {
304  #   "files-md5": "VALUE",
305  #   "strings-md5": "VALUE",
306  #   "input-files": [
307  #     {
308  #       "path": "path.jar",
309  #       "tag": "{MD5 of entries}",
310  #       "entries": [
311  #         { "path": "org/chromium/base/Foo.class", "tag": "{CRC32}" }, ...
312  #       ]
313  #     }, {
314  #       "path": "path.txt",
315  #       "tag": "{MD5}",
316  #     }
317  #   ],
318  #   "input-strings": ["a", "b", ...],
319  # }
320  def __init__(self, track_entries=False):
321    self._track_entries = track_entries
322    self._files_md5 = None
323    self._strings_md5 = None
324    self._files = []
325    self._strings = []
326    # Map of (path, subpath) -> entry. Created upon first call to _GetEntry().
327    self._file_map = None
328
329  @classmethod
330  def FromFile(cls, fileobj):
331    """Returns a _Metadata initialized from a file object."""
332    ret = cls()
333    obj = json.load(fileobj)
334    ret._files_md5 = obj['files-md5']
335    ret._strings_md5 = obj['strings-md5']
336    ret._files = obj.get('input-files', [])
337    ret._strings = obj.get('input-strings', [])
338    return ret
339
340  def ToFile(self, fileobj):
341    """Serializes metadata to the given file object."""
342    obj = {
343        'files-md5': self.FilesMd5(),
344        'strings-md5': self.StringsMd5(),
345    }
346    if self._track_entries:
347      obj['input-files'] = sorted(self._files, key=lambda e: e['path'])
348      obj['input-strings'] = self._strings
349
350    json.dump(obj, fileobj, indent=2)
351
352  def _AssertNotQueried(self):
353    assert self._files_md5 is None
354    assert self._strings_md5 is None
355    assert self._file_map is None
356
357  def AddStrings(self, values):
358    self._AssertNotQueried()
359    self._strings.extend(str(v) for v in values)
360
361  def AddFile(self, path, tag):
362    """Adds metadata for a non-zip file.
363
364    Args:
365      path: Path to the file.
366      tag: A short string representative of the file contents.
367    """
368    self._AssertNotQueried()
369    self._files.append({
370        'path': path,
371        'tag': tag,
372    })
373
374  def AddZipFile(self, path, entries):
375    """Adds metadata for a zip file.
376
377    Args:
378      path: Path to the file.
379      entries: List of (subpath, tag) tuples for entries within the zip.
380    """
381    self._AssertNotQueried()
382    tag = _ComputeInlineMd5(itertools.chain((e[0] for e in entries),
383                                            (e[1] for e in entries)))
384    self._files.append({
385        'path': path,
386        'tag': tag,
387        'entries': [{"path": e[0], "tag": e[1]} for e in entries],
388    })
389
390  def GetStrings(self):
391    """Returns the list of input strings."""
392    return self._strings
393
394  def FilesMd5(self):
395    """Lazily computes and returns the aggregate md5 of input files."""
396    if self._files_md5 is None:
397      # Omit paths from md5 since temporary files have random names.
398      self._files_md5 = _ComputeInlineMd5(
399          self.GetTag(p) for p in sorted(self.IterPaths()))
400    return self._files_md5
401
402  def StringsMd5(self):
403    """Lazily computes and returns the aggregate md5 of input strings."""
404    if self._strings_md5 is None:
405      self._strings_md5 = _ComputeInlineMd5(self._strings)
406    return self._strings_md5
407
408  def _GetEntry(self, path, subpath=None):
409    """Returns the JSON entry for the given path / subpath."""
410    if self._file_map is None:
411      self._file_map = {}
412      for entry in self._files:
413        self._file_map[(entry['path'], None)] = entry
414        for subentry in entry.get('entries', ()):
415          self._file_map[(entry['path'], subentry['path'])] = subentry
416    return self._file_map.get((path, subpath))
417
418  def GetTag(self, path, subpath=None):
419    """Returns the tag for the given path / subpath."""
420    ret = self._GetEntry(path, subpath)
421    return ret and ret['tag']
422
423  def IterPaths(self):
424    """Returns a generator for all top-level paths."""
425    return (e['path'] for e in self._files)
426
427  def IterSubpaths(self, path):
428    """Returns a generator for all subpaths in the given zip.
429
430    If the given path is not a zip file or doesn't exist, returns an empty
431    iterable.
432    """
433    outer_entry = self._GetEntry(path)
434    if not outer_entry:
435      return ()
436    subentries = outer_entry.get('entries', [])
437    return (entry['path'] for entry in subentries)
438
439
440def _ComputeTagForPath(path):
441  stat = os.stat(path)
442  if stat.st_size > 1 * 1024 * 1024:
443    # Fallback to mtime for large files so that md5_check does not take too long
444    # to run.
445    return stat.st_mtime
446  md5 = hashlib.md5()
447  with open(path, 'rb') as f:
448    md5.update(f.read())
449  return md5.hexdigest()
450
451
452def _ComputeInlineMd5(iterable):
453  """Computes the md5 of the concatenated parameters."""
454  md5 = hashlib.md5()
455  for item in iterable:
456    md5.update(str(item).encode('ascii'))
457  return md5.hexdigest()
458
459
460def _ExtractZipEntries(path):
461  """Returns a list of (path, CRC32) of all files within |path|."""
462  entries = []
463  with zipfile.ZipFile(path) as zip_file:
464    for zip_info in zip_file.infolist():
465      # Skip directories and empty files.
466      if zip_info.CRC:
467        entries.append(
468            (zip_info.filename, zip_info.CRC + zip_info.compress_type))
469  return entries
470