xref: /aosp_15_r20/external/google-cloud-java/owl-bot-postprocessor/synthtool/metadata.py (revision 55e87721aa1bc457b326496a7ca40f3ea1a63287)
1# Copyright 2018 Google LLC
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     https://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15import fnmatch
16import locale
17import os
18import pathlib
19import shutil
20import subprocess
21import sys
22import tempfile
23import threading
24import time
25from typing import Dict, Iterable, List
26
27import google.protobuf.json_format
28import watchdog.events
29import watchdog.observers
30
31from synthtool.log import logger
32from synthtool.protos import metadata_pb2
33
# Module-level metadata that the add_* functions append to and write()
# serializes.  reset() replaces it with a fresh, empty message.
_metadata = metadata_pb2.Metadata()
35
36
def get_environment_bool(var_name: str) -> bool:
    """Interprets an environment variable as a boolean flag.

    Returns False when the variable is unset, empty, or equal to "false"
    (compared case-insensitively).  Any other value yields True.
    """
    raw_value = os.environ.get(var_name)
    if not raw_value:
        return False
    return raw_value.lower() != "false"
40
41
# Whether obsolete generated files should be tracked and removed.  The initial
# value comes from the SYNTHTOOL_TRACK_OBSOLETE_FILES environment variable;
# set_track_obsolete_files() overrides it.
_track_obsolete_files = get_environment_bool("SYNTHTOOL_TRACK_OBSOLETE_FILES")

# The list of file patterns excluded during a copy() or move() operation.
# Obsolete files matching one of these patterns are not deleted.
_excluded_patterns: List[str] = []
46
47
def reset() -> None:
    """Discards the accumulated metadata and the recorded excluded patterns.

    Both module-level objects are rebound to fresh, empty values.
    """
    global _metadata, _excluded_patterns
    _metadata = metadata_pb2.Metadata()
    _excluded_patterns = []
54
55
def get():
    """Returns the module-level metadata message accumulated so far."""
    return _metadata
58
59
def add_git_source(**kwargs) -> None:
    """Adds a git source to the current metadata.

    Keyword arguments are forwarded unchanged to metadata_pb2.GitSource.
    """
    _metadata.sources.add(git=metadata_pb2.GitSource(**kwargs))
63
64
def add_pattern_excluded_during_copy(glob_pattern: str) -> None:
    """Records a glob pattern for files excluded during copy.

    Used to avoid deleting an obsolete file that is excluded: obsolete files
    matching a recorded pattern are left in place.
    """
    _excluded_patterns.append(glob_pattern)
70
71
def add_generator_source(**kwargs) -> None:
    """Adds a generator source to the current metadata.

    Keyword arguments are forwarded unchanged to metadata_pb2.GeneratorSource.
    """
    _metadata.sources.add(generator=metadata_pb2.GeneratorSource(**kwargs))
75
76
def add_template_source(**kwargs) -> None:
    """Adds a template source to the current metadata.

    Keyword arguments are forwarded unchanged to metadata_pb2.TemplateSource.
    """
    _metadata.sources.add(template=metadata_pb2.TemplateSource(**kwargs))
80
81
def add_client_destination(**kwargs) -> None:
    """Adds a client library destination to the current metadata.

    Keyword arguments are forwarded unchanged to
    metadata_pb2.ClientDestination.
    """
    _metadata.destinations.add(client=metadata_pb2.ClientDestination(**kwargs))
85
86
def _git_slashes(path: str):
    """Returns the path in the form git expects.

    On Windows every backslash is replaced with a forward slash; on any other
    platform the path is returned untouched.
    """
    if sys.platform != "win32":
        return path
    # git speaks only forward slashes
    return path.replace("\\", "/")
90
91
def _read_or_empty(path: str = "synth.metadata"):
    """Loads a metadata JSON file into a metadata_pb2.Metadata message.

    A missing file is not an error: an empty Metadata message is returned
    instead.
    """
    try:
        with open(path, "rt") as metadata_file:
            json_text = metadata_file.read()
    except FileNotFoundError:
        return metadata_pb2.Metadata()
    return google.protobuf.json_format.Parse(json_text, metadata_pb2.Metadata())
100
101
def write(outfile: str = "synth.metadata") -> None:
    """Serializes the current metadata to JSON and saves it to outfile."""
    # Serialize before opening so a serialization failure cannot truncate an
    # existing file.
    metadata_json = google.protobuf.json_format.MessageToJson(_metadata)

    with open(outfile, "w") as metadata_file:
        metadata_file.write(metadata_json)

    logger.debug(f"Wrote metadata to {outfile}.")
110
111
def _remove_obsolete_files(old_metadata):
    """Delete files that were generated previously but not during this run.

    A file is obsolete when it is listed in old_metadata.generated_files but
    not in the current metadata's generated_files.  Obsolete files that git
    ignores, or that match a pattern recorded through
    add_pattern_excluded_during_copy(), are left on disk.

    The current metadata's generated_files must be populated before calling
    this function; otherwise every previously generated file is considered
    obsolete and removed.

    Parameters:
        old_metadata:  metadata from the previous run, as returned by
            _read_or_empty().
    """
    previous_files = set(old_metadata.generated_files)
    current_files = set(_metadata.generated_files)
    kept_patterns = set(_excluded_patterns)
    for file_path in git_ignore(previous_files - current_files):
        # First excluded pattern matching this file, or None when none match.
        pattern = next(
            (
                candidate
                for candidate in kept_patterns
                if fnmatch.fnmatch(file_path, candidate)
            ),
            None,
        )
        if pattern is not None:
            logger.info(
                f"Leaving obsolete file {file_path} because it matched excluded pattern {pattern} during copy."
            )
            continue
        logger.info(f"Removing obsolete file {file_path}...")
        try:
            os.unlink(file_path)
        except FileNotFoundError:
            pass  # Already deleted.  That's OK.
141
142
def git_ignore(file_paths: Iterable[str]):
    """Returns a new list of the same files, with ignored files removed.

    Paths that have a ".git" component are always dropped.  The remaining
    paths are passed to `git check-ignore`, run in the current working
    directory, and every path git reports as ignored is dropped as well.

    Parameters:
        file_paths: paths to filter.

    Returns:
        A list of the surviving paths, in their original order and spelling.

    Raises:
        FileNotFoundError: if there are paths to check and git is not on PATH.
    """
    # Surprisingly, git check-ignore doesn't ignore .git directories, take those
    # files out manually.
    nongit_file_paths = [
        file_path
        for file_path in file_paths
        if ".git" not in pathlib.Path(file_path).parts
    ]
    if not nongit_file_paths:
        # Nothing to ask git about, so don't spawn (or even require) git.
        return []

    git = shutil.which("git")
    if not git:
        raise FileNotFoundError("Could not find git in PATH.")

    encoding = locale.getpreferredencoding(False)
    # Use NUL-delimited records (-z) for both stdin and stdout.  In the default
    # line-based mode git C-quotes paths containing unusual or non-ASCII
    # characters, and the quoted output would never match the input paths.
    stdin_bytes = b"".join(
        _git_slashes(file_path).encode(encoding) + b"\0"
        for file_path in nongit_file_paths
    )
    completed_process = subprocess.run(
        [git, "check-ignore", "-z", "--stdin"],
        input=stdin_bytes,
        stdout=subprocess.PIPE,
    )
    # Digest git output.  Skip empty records (such as the one after the final
    # NUL): os.path.normpath("") is ".", which must not count as ignored.
    ignored_file_paths = {
        os.path.normpath(ignored_path)
        for ignored_path in completed_process.stdout.decode(encoding).split("\0")
        if ignored_path
    }
    # Filter the ignored paths from the file_paths.
    return [
        path
        for path in nongit_file_paths
        if os.path.normpath(path) not in ignored_file_paths
    ]
178
179
def set_track_obsolete_files(track_obsolete_files: bool = True) -> None:
    """Instructs synthtool to track and remove obsolete files.

    Overrides the value initially derived from the
    SYNTHTOOL_TRACK_OBSOLETE_FILES environment variable.
    """
    global _track_obsolete_files
    _track_obsolete_files = track_obsolete_files
184
185
def should_track_obsolete_files():
    """Returns the current obsolete-file tracking setting."""
    return _track_obsolete_files
188
189
class FileSystemEventHandler(watchdog.events.FileSystemEventHandler):
    """Collects the paths of files created, modified, or moved in a directory.

    Paths are recorded relative to the watched directory.  Access to the
    recorded list is guarded by a lock.
    """

    def __init__(self, watch_dir: pathlib.Path):
        super().__init__()
        self._watch_dir = watch_dir
        self._touched_lock = threading.Lock()
        self._touched_file_paths: List[str] = []

    def on_any_event(self, event):
        """Records the file affected by a created, modified, or moved event.

        Directory events and every other event type are disregarded.  For a
        move, the destination path is the one recorded.
        """
        if event.is_directory:
            return
        event_type = event.event_type
        if event_type == watchdog.events.EVENT_TYPE_MOVED:
            event_path = event.dest_path
        elif event_type in (
            watchdog.events.EVENT_TYPE_MODIFIED,
            watchdog.events.EVENT_TYPE_CREATED,
        ):
            event_path = event.src_path
        else:
            return
        relative_path = pathlib.Path(event_path).relative_to(self._watch_dir)
        with self._touched_lock:
            self._touched_file_paths.append(str(relative_path))

    def get_touched_file_paths(self) -> List[str]:
        """Returns the recorded paths, without duplicates, in sorted order."""
        with self._touched_lock:
            unique_paths = set(self._touched_file_paths)
        return sorted(unique_paths)
222
223
class MetadataTrackerAndWriter:
    """Writes metadata file upon exiting scope.

    On entry the previous metadata file is loaded, the current repository is
    recorded as a git source and, if obsolete file tracking is enabled, a
    file system observer starts watching the metadata file's directory.

    On a clean exit the touched files are recorded as generated files,
    obsolete files are removed, and the metadata file is written.  If the
    body raised, nothing is written or removed, but the observer is still
    shut down.
    """

    def __init__(self, metadata_file_path: str):
        self.metadata_file_path = metadata_file_path
        # Populated by __enter__.  handler and observer stay None unless
        # obsolete file tracking is enabled at that time.
        self.old_metadata = None
        self.handler = None
        self.observer = None

    def __enter__(self):
        self.old_metadata = _read_or_empty(self.metadata_file_path)
        _add_self_git_source()
        watch_dir = pathlib.Path(self.metadata_file_path).parent
        os.makedirs(watch_dir, exist_ok=True)
        self.handler = None
        self.observer = None
        # Create an observer only if obsolete file tracking is enabled.
        # This prevents inotify errors in synth jobs that may delete the watch
        # dir. Such synth jobs should leave obsolete file tracking disabled.
        if should_track_obsolete_files():
            self.handler = FileSystemEventHandler(watch_dir)
            self.observer = watchdog.observers.Observer()
            self.observer.schedule(self.handler, str(watch_dir), recursive=True)
            self.observer.start()

    def __exit__(self, type, value, traceback):
        # Decide from the observer actually created in __enter__, not only from
        # the current setting: the setting may have been toggled inside the
        # with block.
        observer, self.observer = self.observer, None
        succeeded = value is None
        tracking = (
            succeeded and observer is not None and should_track_obsolete_files()
        )
        if tracking:
            time.sleep(2)  # Finish collecting observations about modified files.
        if observer is not None:
            # Always shut the observer down, even after an exception, so its
            # thread and watches are not leaked.
            observer.stop()
            observer.join()
        if not succeeded:
            return  # An exception was raised.  Don't write metadata or clean up.
        if tracking:
            _metadata.generated_files.extend(
                git_ignore(self.handler.get_touched_file_paths())
            )
            _remove_obsolete_files(self.old_metadata)
        elif should_track_obsolete_files():
            # Tracking was switched on after __enter__, so no files were
            # observed.  Removing obsolete files now would delete every
            # previously generated file, so skip it.
            logger.warning(
                "Obsolete file tracking was enabled after metadata tracking "
                "started; skipping obsolete file removal."
            )
        _clear_local_paths(get())
        _metadata.sources.sort(key=_source_key)
        if _enable_write_metadata:
            write(self.metadata_file_path)
259
260
def _get_git_source_map(metadata) -> Dict[str, object]:
    """Indexes the git sources of the metadata by name.

    Sources that are not git sources are skipped.  When several git sources
    share a name, the last one wins.

    Parameters:
        metadata: an instance of metadata_pb2.Metadata.

    Returns:
        A dict mapping git source name to metadata_pb2.GitSource instance.
    """
    return {
        source.git.name: source.git
        for source in metadata.sources
        if source.HasField("git")
    }
276
277
def _clear_local_paths(metadata):
    """Strips local_path from every git source in the metadata.

    There's no reason to preserve it, and it may leak some info we don't
    want to leak in the path.
    """
    git_sources = (
        source.git for source in metadata.sources if source.HasField("git")
    )
    for git_source in git_sources:
        git_source.ClearField("local_path")
288
289
def _add_self_git_source():
    """Adds current working directory as a git source.

    The source is recorded under the name ".".

    Returns:
        The number of git sources added to metadata.
    """
    # The source is named "."; the repo is located from the current working
    # directory.
    return _add_git_source_from_directory(".", os.getcwd())
298
299
def _add_git_source_from_directory(name: str, dir_path: str) -> int:
    """Records the git repo containing dir_path as a git source.

    The source's remote is the URL of the "origin" remote and its sha is the
    hash of the latest commit, both as reported by git.

    Returns:
        The number of git sources added to metadata: 1 on success, 0 when
        `git status` fails for dir_path.
    """
    git_in_dir = ["git", "-C", dir_path]

    def capture_stdout(*git_args: str) -> str:
        # Runs git in dir_path and returns its stripped standard output.
        finished = subprocess.run(
            git_in_dir + list(git_args),
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
        return finished.stdout.strip()

    # The output of this probe is not captured; only the exit status matters.
    status = subprocess.run(git_in_dir + ["status"], universal_newlines=True)
    if status.returncode != 0:
        logger.warning("%s is not directory in a git repo.", dir_path)
        return 0

    url = capture_stdout("remote", "get-url", "origin")
    latest_sha = capture_stdout("log", "--no-decorate", "-1", "--pretty=format:%H")
    add_git_source(name=name, remote=url, sha=latest_sha)
    return 1
326
327
def _source_key(source):
    """Creates a key to use to sort a list of sources.

    Sources are grouped by kind ("generator", "git", "template") and ordered
    by their identifying fields within each kind.  A source with none of
    these fields set gets the key ("",), which sorts before all others;
    returning None for it would make sorting raise TypeError as soon as it is
    compared with a tuple key.

    Arguments:
        source {metadata_pb2.Source} -- the Source for which to formulate a sort key

    Returns:
        tuple -- A key to use to sort a list of sources.
    """
    if source.HasField("git"):
        git = source.git
        return ("git", git.name, git.remote, git.sha)
    if source.HasField("generator"):
        generator = source.generator
        return (
            "generator",
            generator.name,
            generator.version,
            generator.docker_image,
        )
    if source.HasField("template"):
        template = source.template
        return ("template", template.name, template.origin, template.version)
    # Unknown or empty source: still return a tuple so keys stay comparable.
    return ("",)
353
354
# When False, MetadataTrackerAndWriter does not write the metadata file on
# exit.  Changed through enable_write_metadata().
_enable_write_metadata = True
356
357
def enable_write_metadata(enable: bool = True) -> None:
    """Control whether synthtool writes synth.metadata file.

    The setting is consulted when a MetadataTrackerAndWriter scope exits.
    """
    global _enable_write_metadata
    _enable_write_metadata = enable
362