# Copyright 2024 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Report usage to Google Analytics."""

import argparse
import dataclasses
import enum
import getpass
import json
import logging
import multiprocessing
import os
from pathlib import Path
import platform
import pprint
import re
import subprocess
import sys
from typing import Any, Sequence
import time

import requests

import pw_env_setup.config_file

from . import cli
from . import config

_LOG: logging.Logger = logging.getLogger(__name__)

# Subcommand names that are reported verbatim when the
# 'report_subcommand_name' preference is 'limited'; any other subcommand is
# reported as 'redacted' in that mode.
SAFE_SUBCOMMANDS_TO_ALWAYS_REPORT = (
    'analytics',
    'bloat',
    'build',
    'console',
    'doctor',
    'emu',
    'format',
    'heap-viewer',
    'help',
    'ide',
    'keep-sorted',
    'logdemo',
    'module',
    'package',
    'presubmit',
    'python-packages',
    'requires',
    'rpc',
    'test',
    'update',
    'watch',
)


def _git_output(
    args: Sequence[str], cwd: Path | str | None = None
) -> str | None:
    """Run ``git`` and return its stripped stdout, or None on any failure.

    Args:
        args: Arguments to pass to ``git`` (without the leading 'git').
        cwd: Directory to run the command in.

    Returns:
        The decoded, whitespace-stripped stdout, or None if git exited with a
        non-zero status or could not be started at all.
    """
    try:
        result = subprocess.run(
            ['git', *args],
            check=True,
            cwd=cwd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing git binary or an invalid cwd. Analytics
        # is best-effort and must never break the command being run.
        return None
    return result.stdout.decode(errors='replace').strip()


def _upstream_remote(cwd: Path | str | None = None) -> str | None:
    """Return the remote for this branch, or None if there are errors.

    Follows the chain of ``@{upstream}`` references starting from the current
    branch until a name containing '/' (i.e. '<remote>/<branch>') is found,
    then looks up that remote's URL.

    Args:
        cwd: Directory of the checkout to inspect.

    Returns:
        The remote URL, or None if no upstream could be resolved, the chain of
        upstreams loops, or the URL looks like a local path.
    """
    upstream = ''
    prev_upstreams = [upstream]
    while '/' not in upstream:
        candidate = _git_output(
            [
                'rev-parse',
                '--abbrev-ref',
                '--symbolic-full-name',
                f'{upstream}@{{upstream}}',
            ],
            cwd=cwd,
        )
        # Test for a cycle *before* recording the candidate. Recording it
        # first makes the membership test trivially true, which made this
        # function return None unconditionally.
        if candidate is None or candidate in prev_upstreams:
            return None
        prev_upstreams.append(candidate)
        upstream = candidate

    remote = upstream.split('/')[0]
    url = _git_output(['config', '--get', f'remote.{remote}.url'], cwd=cwd)

    # Don't return local paths.
    if url is None or ':' not in url:
        return None

    return url


def _fallback_remote(cwd: Path | str | None = None) -> str | None:
    """Get all the remotes and use some heuristics to pick one of them.

    Args:
        cwd: Directory of the checkout to inspect.

    Returns:
        The URL of the remote named 'origin' if there is one, otherwise the
        URL shared by the most remotes. None if git fails or no remote has a
        non-local URL.
    """
    output = _git_output(
        ['config', '--get-regexp', r'^remote\..*\.url$'], cwd=cwd
    )
    if not output:
        return None

    remotes: dict[str, str] = {}
    for line in output.splitlines():
        # Each line is '<key> <value>'. Split only once so that a URL
        # containing whitespace cannot break the unpacking.
        parts = line.split(maxsplit=1)
        if len(parts) != 2:
            continue
        key, url = parts

        # Exclude local paths.
        if ':' not in url:
            continue

        name = key.removeprefix('remote.').removesuffix('.url')
        remotes[name] = url

    if not remotes:
        return None

    # Prefer the remote named 'origin'.
    if 'origin' in remotes:
        return remotes['origin']

    # Pick the URL that occurs most often. Iterating the list (rather than a
    # set) makes ties resolve deterministically to the first one listed.
    urls = list(remotes.values())
    return max(urls, key=urls.count)


def _remote(cwd: Path | str | None = None) -> str | None:
    """Return the checkout's remote URL with any username redacted.

    Args:
        cwd: Directory of the checkout to inspect.

    Returns:
        The URL from _upstream_remote(), falling back to _fallback_remote(),
        with the user-info part replaced by '<username>'. None if neither
        helper produced a URL.
    """
    url = _upstream_remote(cwd=cwd) or _fallback_remote(cwd=cwd)
    if not url:
        return None

    # If there's a username in the url, remove it. Match everything between
    # the last '/' (or the start) and the '@' so that names containing '.',
    # '-' or a ':password' suffix are redacted in full rather than partially.
    return re.sub(r'[^/@\s]+@', '<username>@', url)


@dataclasses.dataclass
class GerritCommit:
    """A Git commit hash paired with a Gerrit change number."""

    commit: str
    change_id: int


def _pigweed_commit(cwd: Path | str | None = None) -> GerritCommit | None:
    """Find the most recently submitted Pigweed commit in use by this checkout.

    Scans ``git log`` output for the first commit whose message has a
    'Reviewed-on:' trailer pointing at the Pigweed Gerrit host.

    Args:
        cwd: Directory to run ``git log`` in. Defaults to the PW_ROOT
            environment variable.

    Returns:
        A GerritCommit holding the Git commit hash and the Gerrit change id,
        or None if no directory is known, git could not be run, or no
        matching commit was found within the scan limit.
    """
    pw_root = cwd or os.environ.get('PW_ROOT')
    if not pw_root:
        return None

    try:
        proc = subprocess.Popen(
            ['git', 'log'],
            cwd=pw_root,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # git is not installed or pw_root is not a usable directory.
        return None

    pw_remote = 'https://pigweed-review.googlesource.com/c/pigweed/pigweed'
    trailer_prefix = f'Reviewed-on: {pw_remote}'

    commit = None
    # The context manager closes the pipe and reaps the child; terminate()
    # makes sure an early exit does not wait for the rest of the history.
    with proc:
        try:
            assert proc.stdout  # Always set because stdout=PIPE.
            for i, raw_line in enumerate(proc.stdout):
                # If we get this far in we've spent too much time and should
                # give up.
                if i > 100000:
                    break

                line = raw_line.decode(encoding='utf-8', errors='replace')
                parts = line.split()
                if len(parts) == 2 and parts[0] == 'commit':
                    commit = parts[1]
                    continue

                if commit is None:
                    continue
                if not line.strip().startswith(trailer_prefix):
                    continue

                try:
                    change_id = int(line.split('/')[-1])
                except ValueError:
                    # Trailer without a numeric change id; keep looking.
                    continue
                return GerritCommit(commit, change_id)
        finally:
            proc.terminate()

    return None


@dataclasses.dataclass
class UserProperties:
    """Properties of the machine/build sent along with each event.

    The field defaults are evaluated once, when this class is defined.
    """

    luci_buildbucket_id: str = os.environ.get('BUILDBUCKET_ID', '')
    luci_buildbucket_name: str = os.environ.get('BUILDBUCKET_NAME', '')
    luci_swarming_id: str = os.environ.get('SWARMING_TASK_ID', '')
    num_cores: int = multiprocessing.cpu_count()
    cpu_architecture: str = platform.machine()
    cpu_bits: int = 64 if sys.maxsize > 2**32 else 32
    os_name: str = platform.system()
    automated_build: bool = False

    def __post_init__(self):
        # TODO: b/319320838 - Figure out how to tell if running in TreeHugger.
        if self.luci_buildbucket_id or self.luci_swarming_id:
            self.automated_build = True


@dataclasses.dataclass
class Payload:
    """User properties plus the parameters of a single event."""

    user_properties: UserProperties = dataclasses.field(
        default_factory=UserProperties,
    )
    event_params: dict[str, int | str | None] = dataclasses.field(
        default_factory=dict
    )


def remove_username(
    arg: Path | str,
    *,
    username: str = getpass.getuser(),
    home: str = os.path.expanduser('~'),
) -> str:
    """Remove usernames from Path-like objects.

    Examples:
      /home/user/foo/bar => $HOME/foo/bar
      /home/user/foo/user/bar => $HOME/foo/$USERNAME/bar
      /data/foo/bar => /data/foo/bar
      --force => --force

    Args:
        arg: Possible Path-like object.
        username: Username of current user.
        home: Home directory of current user.

    Returns:
        The string form of ``arg`` with a leading home directory replaced by
        '$HOME' and whole-word occurrences of the username replaced by
        '$USERNAME'.
    """
    arg = str(arg)

    # Only treat `home` as a prefix when it ends on a path boundary, so that
    # e.g. /home/username2 is not mistaken for something under /home/user.
    # An empty `home` would match every argument, so skip it.
    if home and arg.startswith(home):
        rest = arg[len(home) :]
        if not rest or rest.startswith(('/', os.sep)):
            arg = f'$HOME{rest}'

    # An empty username would match at every word boundary.
    if not username:
        return arg

    # Escape the username: it is data, not a regular expression.
    return re.sub(rf'\b{re.escape(username)}\b', '$USERNAME', arg)


class State(enum.Enum):
    """Result of initialize()."""

    UNKNOWN = 0
    ALREADY_INITIALIZED = 1
    NEWLY_INITIALIZED = 2


# Cached result of initialize(); None until initialize() has run.
_INITIALIZE_STATE: State | None = None


class Analytics:
    """Report usage to Google Analytics."""

    def __init__(
        self,
        argv: Sequence[str],
        parsed_args: argparse.Namespace,
        *args,
        debug: bool = False,
        **kwargs,
    ):
        """Initializes an Analytics object.

        Args:
            argv: List of arguments passed to this script.
            parsed_args: The parsed representation of those arguments.
            *args: Arguments to pass through to super().
            debug: Whether to use the debug host.
            **kwargs: Arguments to pass through to super().
        """

        super().__init__(*args, **kwargs)

        self._argv: Sequence[str] = argv[:]
        self._parsed_args: argparse.Namespace = parsed_args

        self._debug: bool = debug
        self._start_time: float | None = None

        self.config = config.AnalyticsPrefs()

        self._url: str = self.config['prod_url']
        if debug:
            self._url = self.config['debug_url']

        # Reporting is disabled when there is no UUID, when the user has
        # disabled it, for the analytics command itself, and on the run in
        # which analytics was first initialized.
        # NOTE(review): the intro text printed by _intro() refers to
        # `pw cli-analytics`; confirm 'analytics' is the name that appears in
        # parsed_args.command.
        self._enabled = True
        if self.config['uuid'] is None or not self.config['enabled']:
            self._enabled = False
        elif self._parsed_args.command == 'analytics':
            self._enabled = False
        elif _INITIALIZE_STATE == State.NEWLY_INITIALIZED:
            self._enabled = False

    def _payload(self, cwd: Path | str | None) -> Payload:
        """Create a Payload object.

        Create a Payload object based on the system and the arguments to the
        current script based on the config.

        See https://pigweed.dev/pw_cli_analytics for details about these values.

        Args:
            cwd: Checkout directory.

        Raises:
            ValueError: If 'report_subcommand_name' has an unexpected value,
                or 'report_command_line' is set while
                'report_subcommand_name' is not 'always'.
        """
        payload = Payload()
        payload.event_params['command'] = 'redacted'

        rsn = self.config['report_subcommand_name']
        if rsn == 'always':
            payload.event_params['command'] = self._parsed_args.command

        elif rsn == 'limited':
            if self._parsed_args.command in SAFE_SUBCOMMANDS_TO_ALWAYS_REPORT:
                payload.event_params['command'] = self._parsed_args.command

        elif rsn == 'never':
            pass

        else:
            raise ValueError(f'unexpected report_subcommand_name value {rsn!r}')

        if self.config['report_command_line']:
            if rsn != 'always':
                raise ValueError(
                    'report_command_line is True but report_subcommand_name '
                    f'is {rsn!r}'
                )
            # Report at most the first ten arguments.
            for i, arg in enumerate(self._argv[:10]):
                payload.event_params[f'argv_{i}'] = remove_username(arg)

        if self.config['report_project_name']:
            pw_config: dict[str, Any] = pw_env_setup.config_file.load().get(
                'pw', {}
            )
            root_var = pw_config.get('pw_env_setup', {}).get('root_variable')
            payload.event_params['project_root_var'] = root_var

        if self.config['report_remote_url']:
            payload.event_params['git_remote'] = _remote(cwd=cwd)

        # Called without cwd, so this inspects the directory named by the
        # PW_ROOT environment variable rather than the project checkout.
        pw_commit = _pigweed_commit()
        if pw_commit:
            payload.event_params['pigweed_commit'] = pw_commit.commit
            payload.event_params['pigweed_change_id'] = pw_commit.change_id

        return payload

    def _event(self, payload: Payload):
        """Send a Payload to Google Analytics.

        Does nothing when reporting is disabled. Network and HTTP failures
        are logged at debug level and otherwise ignored.
        """

        if not self._enabled:
            return

        headers = {
            'User-Agent': 'pw-command-line-tool',
            'Content-Type': 'application/json',
        }

        # Values in Google Analytics need to be wrapped in a {'value': x} dict.
        def convert(x):
            return {'value': '' if x is None else str(x)}

        # There are limitations on names. For more see
        # https://support.google.com/analytics/answer/13316687?hl=en.
        name = f'pw_{payload.event_params["command"]}'

        measurement = {
            'client_id': self.config['uuid'],
            'user_properties': {
                k: convert(v)
                for k, v in dataclasses.asdict(payload.user_properties).items()
                if v
            },
            'events': [
                {
                    'name': name.replace('-', '_'),
                    'params': payload.event_params,
                },
            ],
        }

        api = self.config['api_secret']
        meas_id = self.config['measurement_id']

        url = f'{self._url}?api_secret={api}&measurement_id={meas_id}'

        _LOG.debug('POST %s', url)
        for line in pprint.pformat(measurement).splitlines():
            _LOG.debug('%s', line)

        raw_json = json.dumps(measurement)
        _LOG.debug('Raw json: %r', raw_json)

        # Make the request but don't wait very long, and don't surface any
        # errors to the user. The post itself must be inside the try:
        # connection failures and timeouts are raised by requests.post(), not
        # by raise_for_status().
        try:
            r = requests.post(
                url,
                headers=headers,
                data=raw_json,
                timeout=3,
            )
            r.raise_for_status()
            _LOG.debug('Response-Content: %s', r.content)
        except requests.exceptions.RequestException as exc:
            _LOG.debug('Request failed: %r', exc)

    def begin(self, cwd=None):
        """Record the start time and send the initial event, if enabled."""
        _LOG.debug('analytics.py begin()')
        if not self._enabled:
            return
        self._start_time = time.monotonic()
        self._event(self._payload(cwd=cwd))

    def end(self, status, cwd=None):
        """Send the final event with the status and duration, if enabled.

        'duration_ms' is only included when begin() recorded a start time.
        """
        _LOG.debug('analytics.py end()')
        if not self._enabled:
            return
        payload = self._payload(cwd=cwd)
        if self._start_time is not None:
            payload.event_params['duration_ms'] = int(
                (time.monotonic() - self._start_time) * 1000
            )
        payload.event_params['status'] = status
        self._event(payload)


def _intro():
    """Print the telemetry notice to stderr."""
    print(
        """
================================================================================
The Pigweed developer tool (`pw`) uses Google Analytics to report usage,
diagnostic, and error data. This data is used to help improve Pigweed, its
libraries, and its tools.

Telemetry is not sent on the very first run. To disable reporting of telemetry
for future invocations, run this terminal command:

    pw cli-analytics --opt-out

If you opt out of telemetry, no further information will be sent. This data is
collected in accordance with the Google Privacy Policy
(https://policies.google.com/privacy). For more details on how Pigweed collects
telemetry, see https://pigweed.dev/pw_cli_analytics.
================================================================================
""".strip(),
        file=sys.stderr,
    )


def initialize() -> State:
    """Initialize Google Analytics state.

    This checks to see if the user file exists (and thus we have a UUID). If it
    does, we return immediately. If not, we print the intro message, opt-in
    (which creates the file and UUID), and return.

    The result is cached in a module-level variable, so repeated calls return
    the state computed by the first call.
    """
    global _INITIALIZE_STATE  # pylint: disable=global-statement
    if _INITIALIZE_STATE is not None:
        return _INITIALIZE_STATE

    if config.DEFAULT_USER_FILE.is_file():
        _INITIALIZE_STATE = State.ALREADY_INITIALIZED
        return _INITIALIZE_STATE

    _intro()

    cli.run(opt='in')

    _INITIALIZE_STATE = State.NEWLY_INITIALIZED
    return _INITIALIZE_STATE


def finalize() -> None:
    """Finalize Google Analytics state.

    If we printed the intro message at the beginning of the command, print it
    again at the end. (We may have produced a lot of output in between and we
    don't want people to miss this.)
    """
    if _INITIALIZE_STATE == State.NEWLY_INITIALIZED:
        _intro()