xref: /aosp_15_r20/external/pigweed/pw_cli_analytics/py/pw_cli_analytics/analytics.py (revision 61c4878ac05f98d0ceed94b57d316916de578985)
1# Copyright 2024 The Pigweed Authors
2#
3# Licensed under the Apache License, Version 2.0 (the "License"); you may not
4# use this file except in compliance with the License. You may obtain a copy of
5# the License at
6#
7#     https://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12# License for the specific language governing permissions and limitations under
13# the License.
14"""Report usage to Google Analytics."""
15
16import argparse
17import dataclasses
18import enum
19import getpass
20import json
21import logging
22import multiprocessing
23import os
24from pathlib import Path
25import platform
26import pprint
27import re
28import subprocess
29import sys
30from typing import Any, Sequence
31import time
32
33import requests
34
35import pw_env_setup.config_file
36
37from . import cli
38from . import config
39
40_LOG: logging.Logger = logging.getLogger(__name__)
41
# Subcommand names that are reported verbatim when the
# 'report_subcommand_name' preference is 'limited'. In that mode any
# subcommand not listed here is reported as 'redacted' instead.
SAFE_SUBCOMMANDS_TO_ALWAYS_REPORT = (
    'analytics',
    'bloat',
    'build',
    'console',
    'doctor',
    'emu',
    'format',
    'heap-viewer',
    'help',
    'ide',
    'keep-sorted',
    'logdemo',
    'module',
    'package',
    'presubmit',
    'python-packages',
    'requires',
    'rpc',
    'test',
    'update',
    'watch',
)
65
66
67def _upstream_remote(cwd: Path | str | None = None) -> str | None:
68    """Return the remote for this branch, or None if there are errors."""
69    upstream = ''
70    prev_upstreams = [upstream]
71    while '/' not in upstream:
72        try:
73            upstream = (
74                subprocess.run(
75                    [
76                        'git',
77                        'rev-parse',
78                        '--abbrev-ref',
79                        '--symbolic-full-name',
80                        f'{upstream}@{{upstream}}',
81                    ],
82                    check=True,
83                    cwd=cwd,
84                    stdout=subprocess.PIPE,
85                    stderr=subprocess.DEVNULL,
86                )
87                .stdout.decode()
88                .strip()
89            )
90        except subprocess.CalledProcessError:
91            return None
92        prev_upstreams.append(upstream)
93
94        if upstream in prev_upstreams:
95            return None
96
97    remote = upstream.split('/')[0]
98    try:
99        url = (
100            subprocess.run(
101                ['git', 'config', '--get', f'remote.{remote}.url'],
102                check=True,
103                cwd=cwd,
104                stdout=subprocess.PIPE,
105                stderr=subprocess.DEVNULL,
106            )
107            .stdout.decode()
108            .strip()
109        )
110    except subprocess.CalledProcessError:
111        return None
112
113    # Don't return local paths.
114    if ':' not in url:
115        return None
116
117    return url
118
119
120def _fallback_remote(cwd: Path | str | None = None) -> str | None:
121    """Get all the remotes and use some heuristics to pick one of them."""
122    remotes = {}
123
124    try:
125        result = subprocess.run(
126            ['git', 'config', '--get-regexp', r'^remote\..*\.url$'],
127            check=True,
128            cwd=cwd,
129            stdout=subprocess.PIPE,
130            stderr=subprocess.DEVNULL,
131        )
132    except subprocess.CalledProcessError:
133        return None
134
135    for line in result.stdout.decode().strip().splitlines():
136        branch, remote = line.split()
137
138        # Exclude remotes
139        if ':' not in remote:
140            continue
141
142        if branch.startswith('remote.'):
143            branch = branch[len('remote.') :]
144        if branch.endswith('.url'):
145            branch = branch[: -len('.url')]
146        remotes[branch] = remote
147
148    # Prefer the remote named 'origin'.
149    if 'origin' in remotes:
150        return remotes['origin']
151
152    # Filter to the remotes that occur most often. Then pick one of those
153    # arbitrarily.
154    values = list(remotes.values())
155    return max(set(values), key=values.count)
156
157
def _remote(cwd: Path | str | None = None) -> str | None:
    """Return the checkout's remote URL with any user info redacted.

    Tries the remote tracked by the current branch first and falls back to a
    heuristic choice among all configured remotes.

    Args:
        cwd: Directory in which to run git. None means the current directory.

    Returns:
        The remote URL with the user-info part replaced by '<username>', or
        None if no suitable remote was found.
    """
    url = _upstream_remote(cwd=cwd) or _fallback_remote(cwd=cwd)
    if not url:
        return None

    # If there's a username in the url, remove it. Two shapes are handled:
    #   scp-like:  user@host:path          -> <username>@host:path
    #   URL:       scheme://user:pw@host/p -> scheme://<username>@host/p
    # The whole user-info component is replaced, so names containing '.', '-'
    # or a ':password' suffix are not partially leaked, and an '@' that only
    # appears later in the path is left alone.
    return re.sub(
        r'^[^/:@\s]+@|(?<=://)[^/\s]+@',
        '<username>@',
        url,
    )
165
166
@dataclasses.dataclass
class GerritCommit:
    """A Git commit hash paired with a Gerrit change number."""

    # Commit hash, as it appears on a 'commit <hash>' line of `git log`.
    commit: str
    # Integer taken from the last path component of the commit's
    # 'Reviewed-on:' URL.
    change_id: int
171
172
def _pigweed_commit(cwd: Path | str | None = None) -> GerritCommit | None:
    """Find the most recently submitted Pigweed commit in use by this checkout.

    Scans `git log` output for the first commit whose message has a
    'Reviewed-on:' footer pointing at the upstream Pigweed Gerrit host.

    Args:
        cwd: Directory in which to run git. If falsy, the PW_ROOT environment
            variable is used instead.

    Returns:
        A GerritCommit holding the Git commit hash and the Gerrit change id,
        or None if no directory is known, git cannot be run, or no matching
        commit is found within the scan limit.
    """
    pw_root = cwd or os.environ.get('PW_ROOT')
    if not pw_root:
        return None

    pw_remote = 'https://pigweed-review.googlesource.com/c/pigweed/pigweed'
    footer_prefix = f'Reviewed-on: {pw_remote}'
    # If we get this far in we've spent too much time and should give up.
    max_line_index = 100000

    try:
        proc = subprocess.Popen(
            ['git', 'log'],
            cwd=pw_root,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # Missing git binary or nonexistent directory.
        return None

    # The context manager closes the pipe and reaps the child on every exit
    # path, so an early return doesn't leave a `git log` process behind.
    with proc:
        try:
            if proc.stdout is None:
                return None

            commit: str | None = None
            for i, raw_line in enumerate(proc.stdout):
                if i > max_line_index:
                    break

                # Commit messages are not guaranteed to be valid UTF-8.
                line = raw_line.decode(encoding='utf-8', errors='replace')
                parts = line.split()
                if len(parts) == 2 and parts[0] == 'commit':
                    commit = parts[1]
                    continue

                if not line.strip().startswith(footer_prefix):
                    continue
                if commit is None:
                    # Footer seen before any 'commit' header; nothing to
                    # attribute it to.
                    continue
                try:
                    change_id = int(line.split('/')[-1])
                except ValueError:
                    # URL doesn't end in a change number; keep looking.
                    continue
                return GerritCommit(commit, change_id)

            return None
        finally:
            # We usually stop reading long before git is done writing.
            proc.terminate()
209
210
@dataclasses.dataclass
class UserProperties:
    """Properties of the machine and CI environment running the command.

    Field defaults are computed once, when this class is defined, from the
    process environment and the host platform.
    """

    luci_buildbucket_id: str = os.environ.get('BUILDBUCKET_ID', '')
    luci_buildbucket_name: str = os.environ.get('BUILDBUCKET_NAME', '')
    luci_swarming_id: str = os.environ.get('SWARMING_TASK_ID', '')
    num_cores: int = multiprocessing.cpu_count()
    cpu_architecture: str = platform.machine()
    cpu_bits: int = 32 if sys.maxsize <= 2**32 else 64
    os_name: str = platform.system()
    automated_build: bool = False

    def __post_init__(self):
        """Mark the build as automated when a LUCI identifier is present."""
        # TODO: b/319320838 - Figure out how to tell if running in TreeHugger.
        no_luci_ids = not self.luci_buildbucket_id and not self.luci_swarming_id
        if no_luci_ids:
            return
        self.automated_build = True
226
227
@dataclasses.dataclass
class Payload:
    """Data for one analytics event: user properties plus event parameters."""

    # Machine/CI properties; a fresh UserProperties is built per Payload.
    user_properties: UserProperties = dataclasses.field(
        default_factory=UserProperties,
    )
    # Per-event parameters keyed by name (for example 'command'); starts
    # empty.
    event_params: dict[str, int | str | None] = dataclasses.field(
        default_factory=dict
    )
236
237
238def remove_username(
239    arg: Path | str,
240    *,
241    username: str = getpass.getuser(),
242    home: str = os.path.expanduser('~'),
243) -> str:
244    """Remove usernames from Path-like objects.
245
246    Examples:
247        /home/user/foo/bar => ~/foo/bar
248        /home/user/foo/user/bar => ~/foo/$USERNAME/bar
249        /data/foo/bar => /data/foo/bar
250        --force => --force
251
252    Args:
253        arg: Possible Path-like object.
254        username: Username of current user.
255        home: Home directory of current user.
256    """
257    arg = str(arg)
258    if arg.startswith(home):
259        arg = f'$HOME{arg[len(home):]}'
260
261    return re.sub(rf'\b{username}\b', '$USERNAME', arg)
262
263
class State(enum.Enum):
    """Result of initialize(): whether analytics was already set up."""

    UNKNOWN = 0
    # The user file already existed when initialize() ran.
    ALREADY_INITIALIZED = 1
    # initialize() printed the intro and opted in during this run.
    NEWLY_INITIALIZED = 2


# Cached result of initialize(); None until initialize() has completed once.
_INITIALIZE_STATE: State | None = None
271
272
class Analytics:
    """Report usage to Google Analytics.

    Typical use is to call begin() before running a command and end() after
    it finishes. Both are no-ops when reporting is disabled.
    """

    def __init__(
        self,
        argv: Sequence[str],
        parsed_args: argparse.Namespace,
        *args,
        debug: bool = False,
        **kwargs,
    ):
        """Initializes an Analytics object.

        Reporting is disabled when there is no UUID, when the 'enabled'
        preference is off, when the command being run is 'analytics', or when
        analytics was initialized for the first time during this run.

        Args:
            argv: List of arguments passed to this script.
            parsed_args: The parsed representation of those arguments.
            *args: Arguments to pass through to super().
            debug: Whether to use the debug host.
            **kwargs: Arguments to pass through to super().
        """

        super().__init__(*args, **kwargs)

        self._argv: Sequence[str] = argv[:]
        self._parsed_args: argparse.Namespace = parsed_args

        self._debug: bool = debug
        # Set by begin(); stays None if begin() never runs.
        self._start_time: float | None = None

        self.config = config.AnalyticsPrefs()

        self._url: str = self.config['prod_url']
        if debug:
            self._url = self.config['debug_url']

        self._enabled = True
        if self.config['uuid'] is None or not self.config['enabled']:
            self._enabled = False
        elif self._parsed_args.command == 'analytics':
            self._enabled = False
        elif _INITIALIZE_STATE == State.NEWLY_INITIALIZED:
            self._enabled = False

    def _payload(self, cwd: Path | str | None) -> Payload:
        """Create a Payload object.

        Create a Payload object based on the system and the arguments to the
        current script based on the config.

        See https://pigweed.dev/pw_cli_analytics for details about these values.

        Args:
            cwd: Checkout directory, or None to use the current directory.

        Returns:
            A Payload whose event parameters are filled in according to the
            reporting preferences.

        Raises:
            ValueError: If 'report_subcommand_name' has an unexpected value,
                or 'report_command_line' is set without
                'report_subcommand_name' being 'always'.
        """
        payload = Payload()
        payload.event_params['command'] = 'redacted'

        rsn = self.config['report_subcommand_name']
        if rsn == 'always':
            payload.event_params['command'] = self._parsed_args.command

        elif rsn == 'limited':
            if self._parsed_args.command in SAFE_SUBCOMMANDS_TO_ALWAYS_REPORT:
                payload.event_params['command'] = self._parsed_args.command

        elif rsn == 'never':
            pass

        else:
            raise ValueError(f'unexpected report_subcommand_name value {rsn!r}')

        if self.config['report_command_line']:
            if rsn != 'always':
                raise ValueError(
                    'report_command_line is True but report_subcommand_name '
                    f'is {rsn!r}'
                )
            # Only the first ten arguments are reported.
            for i, arg in enumerate(self._argv):
                if i >= 10:
                    break
                payload.event_params[f'argv_{i}'] = remove_username(arg)

        if self.config['report_project_name']:
            pw_config: dict[str, Any] = pw_env_setup.config_file.load().get(
                'pw', {}
            )
            root_var = pw_config.get('pw_env_setup', {}).get('root_variable')
            payload.event_params['project_root_var'] = root_var

        if self.config['report_remote_url']:
            payload.event_params['git_remote'] = _remote(cwd=cwd)

        pw_commit = _pigweed_commit()
        if pw_commit:
            payload.event_params['pigweed_commit'] = pw_commit.commit
            payload.event_params['pigweed_change_id'] = pw_commit.change_id

        return payload

    def _event(self, payload: Payload):
        """Send a Payload to Google Analytics.

        Does nothing when reporting is disabled. Network and HTTP failures
        are logged at debug level and never raised to the caller.

        Args:
            payload: The event data to send.
        """

        if not self._enabled:
            return

        headers = {
            'User-Agent': 'pw-command-line-tool',
            'Content-Type': 'application/json',
        }

        # Values in Google Analytics need to be wrapped in a {'value': x} dict.
        def convert(x):
            return {'value': '' if x is None else str(x)}

        # There are limitations on names. For more see
        # https://support.google.com/analytics/answer/13316687?hl=en.
        name = f'pw_{payload.event_params["command"]}'

        measurement = {
            'client_id': self.config['uuid'],
            'user_properties': {
                k: convert(v)
                for k, v in dataclasses.asdict(payload.user_properties).items()
                if v
            },
            'events': [
                {
                    'name': name.replace('-', '_'),
                    'params': payload.event_params,
                },
            ],
        }

        api = self.config['api_secret']
        meas_id = self.config['measurement_id']

        url = f'{self._url}?api_secret={api}&measurement_id={meas_id}'

        _LOG.debug('POST %s', url)
        for line in pprint.pformat(measurement).splitlines():
            _LOG.debug('%s', line)

        raw_json = json.dumps(measurement)
        _LOG.debug('Raw json: %r', raw_json)

        # Make the request but don't wait very long, and don't surface any
        # errors to the user. The post itself must be inside the try block:
        # timeouts and connection failures are raised by requests.post(), not
        # by raise_for_status().
        try:
            r = requests.post(
                url,
                headers=headers,
                data=raw_json,
                timeout=3,
            )
            r.raise_for_status()
            _LOG.debug('Response-Content: %s', r.content)
        except requests.exceptions.RequestException as exc:
            _LOG.debug('Request failed: %r', exc)

    def begin(self, cwd=None):
        """Record the start time and report that the command is starting.

        Args:
            cwd: Checkout directory, or None to use the current directory.
        """
        _LOG.debug('analytics.py begin()')
        if not self._enabled:
            return
        self._start_time = time.monotonic()
        self._event(self._payload(cwd=cwd))

    def end(self, status, cwd=None):
        """Report that the command finished, with its status and duration.

        Args:
            status: Value stored as the event's 'status' parameter.
            cwd: Checkout directory, or None to use the current directory.
        """
        _LOG.debug('analytics.py end()')
        if not self._enabled:
            return
        payload = self._payload(cwd=cwd)
        # Without a begin() call there is no start time; report the event
        # without a duration rather than failing on None arithmetic.
        if self._start_time is not None:
            payload.event_params['duration_ms'] = int(
                (time.monotonic() - self._start_time) * 1000
            )
        payload.event_params['status'] = status
        self._event(payload)
449
450
def _intro():
    """Print the analytics disclosure notice to stderr."""
    notice = """
================================================================================
The Pigweed developer tool (`pw`) uses Google Analytics to report usage,
diagnostic, and error data. This data is used to help improve Pigweed, its
libraries, and its tools.

Telemetry is not sent on the very first run. To disable reporting of telemetry
for future invocations, run this terminal command:

    pw cli-analytics --opt-out

If you opt out of telemetry, no further information will be sent. This data is
collected in accordance with the Google Privacy Policy
(https://policies.google.com/privacy). For more details on how Pigweed collects
telemetry, see https://pigweed.dev/pw_cli_analytics.
================================================================================
"""
    # Drop the blank first line and trailing newline introduced by the
    # triple-quoted literal; print() supplies the final newline.
    print(notice.strip(), file=sys.stderr)
472
473
def initialize() -> State:
    """Set up Google Analytics state, at most once per process.

    The first call decides the state and caches it; later calls return the
    cached value. If the user file already exists (so a UUID is present) the
    state is ALREADY_INITIALIZED. Otherwise the intro message is printed, the
    user is opted in (which creates the file and UUID), and the state is
    NEWLY_INITIALIZED.
    """
    global _INITIALIZE_STATE  # pylint: disable=global-statement
    if _INITIALIZE_STATE is None:
        if config.DEFAULT_USER_FILE.is_file():
            _INITIALIZE_STATE = State.ALREADY_INITIALIZED
        else:
            _intro()
            cli.run(opt='in')
            # Assigned only after opt-in succeeds, so a failure above leaves
            # the state unset.
            _INITIALIZE_STATE = State.NEWLY_INITIALIZED

    return _INITIALIZE_STATE
495
496
def finalize() -> None:
    """Repeat the intro message if this run was the one that showed it.

    A command may print a lot of output after the initial notice, so it is
    shown again at the end to make sure people don't miss it. Nothing happens
    when analytics had already been initialized before this run.
    """
    if _INITIALIZE_STATE != State.NEWLY_INITIALIZED:
        return
    _intro()
505        _intro()
506