1 // SPDX-License-Identifier: LGPL-2.1
2 /*
 * Copyright (C) 2020, VMware, Tzvetomir Stoyanov <[email protected]>
4 *
5 */
6
7 #include <fcntl.h>
8 #include <stdlib.h>
9 #include <unistd.h>
10 #include <sys/stat.h>
11 #include <dirent.h>
12 #include <ctype.h>
13
14 #include "trace-cmd.h"
15 #include "trace-cmd-private.h"
16 #include "tracefs.h"
17 #include "trace-tsync-local.h"
18
19 #define KVM_DEBUG_FS "/sys/kernel/debug/kvm"
20 #define KVM_DEBUG_OFFSET_FILE "tsc-offset"
21 #define KVM_DEBUG_SCALING_FILE "tsc-scaling-ratio"
22 #define KVM_DEBUG_FRACTION_FILE "tsc-scaling-ratio-frac-bits"
23 #define KVM_DEBUG_VCPU_DIR "vcpu"
24
25 /* default KVM scaling values, taken from the Linux kernel */
26 #define KVM_SCALING_AMD_DEFAULT (1ULL<<32)
27 #define KVM_SCALING_INTEL_DEFAULT (1ULL<<48)
28
29 #define KVM_SYNC_PKT_REQUEST 1
30 #define KVM_SYNC_PKT_RESPONSE 2
31
32 typedef __s64 s64;
33
34 #define KVM_ACCURACY 0
35 #define KVM_NAME "kvm"
36
/* Per-connection state of the KVM time sync protocol */
struct kvm_clock_sync {
	int vcpu_count;			/* host side: number of guest VCPUs */
	char **vcpu_offsets;		/* per-VCPU debugfs "tsc-offset" file path */
	char **vcpu_scalings;		/* per-VCPU debugfs "tsc-scaling-ratio" file path */
	char **vcpu_frac;		/* per-VCPU debugfs "tsc-scaling-ratio-frac-bits" file path */
	int marker_fd;			/* guest side: "trace_marker_raw" fd, -1 when not open */
	struct tep_handle *tep;		/* guest side: parsed "ftrace" system event descriptions */
	int raw_id;			/* id of the ftrace "raw_data" event */
	unsigned long long ts;		/* timestamp of the last marker event found */
};
47
/* On-the-wire packet: guest request (ts) and host response (offset/scaling/frac) */
struct kvm_clock_offset_msg {
	s64 ts;		/* guest timestamp of the trace marker event */
	s64 offset;	/* negated KVM TSC offset of the VCPU */
	s64 scaling;	/* TSC scaling ratio, 1 when no scaling applies */
	s64 frac;	/* TSC scaling-ratio fraction bits */
};
54
/*
 * Read a long long integer from the given file.
 *
 * @file: path of the file to read, may be NULL
 * @res: output location for the parsed value, untouched on failure
 *
 * Returns 0 on success, -1 on any failure (NULL path, open or read error).
 */
static int read_ll_from_file(char *file, long long *res)
{
	char buf[32];
	int ret;
	int fd;

	if (!file)
		return -1;
	fd = open(file, O_RDONLY | O_NONBLOCK);
	if (fd < 0)
		return -1;
	/* leave room for the terminator; read() does not NUL-terminate */
	ret = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (ret <= 0)
		return -1;
	buf[ret] = '\0';

	*res = strtoll(buf, NULL, 0);

	return 0;
}
75
kvm_scaling_check_vm_cpu(char * vname,char * cpu)76 static bool kvm_scaling_check_vm_cpu(char *vname, char *cpu)
77 {
78 long long scaling, frac;
79 bool has_scaling = false;
80 bool has_frac = false;
81 char *path;
82 int ret;
83
84 if (asprintf(&path, "%s/%s/%s", vname, cpu, KVM_DEBUG_SCALING_FILE) < 0)
85 return false;
86 ret = read_ll_from_file(path, &scaling);
87 free(path);
88 if (!ret)
89 has_scaling = true;
90
91 if (asprintf(&path, "%s/%s/%s", vname, cpu, KVM_DEBUG_FRACTION_FILE) < 0)
92 return false;
93 ret = read_ll_from_file(path, &frac);
94 free(path);
95 if (!ret)
96 has_frac = true;
97
98 if (has_scaling != has_frac)
99 return false;
100
101 return true;
102 }
103
kvm_scaling_check_vm(char * name)104 static bool kvm_scaling_check_vm(char *name)
105 {
106 struct dirent *entry;
107 char *vdir;
108 DIR *dir;
109
110 if (asprintf(&vdir, "%s/%s", KVM_DEBUG_FS, name) < 0)
111 return true;
112
113 dir = opendir(vdir);
114 if (!dir) {
115 free(vdir);
116 return true;
117 }
118 while ((entry = readdir(dir))) {
119 if (entry->d_type == DT_DIR && !strncmp(entry->d_name, "vcpu", 4) &&
120 !kvm_scaling_check_vm_cpu(vdir, entry->d_name))
121 break;
122 }
123
124 closedir(dir);
125 free(vdir);
126 return entry == NULL;
127 }
kvm_scaling_check(void)128 static bool kvm_scaling_check(void)
129 {
130 struct dirent *entry;
131 DIR *dir;
132
133 dir = opendir(KVM_DEBUG_FS);
134 if (!dir)
135 return true;
136
137 while ((entry = readdir(dir))) {
138 if (entry->d_type == DT_DIR && isdigit(entry->d_name[0]) &&
139 !kvm_scaling_check_vm(entry->d_name))
140 break;
141 }
142 closedir(dir);
143 return entry == NULL;
144 }
145
kvm_support_check(bool guest)146 static bool kvm_support_check(bool guest)
147 {
148 struct stat st;
149 int ret;
150
151 if (guest)
152 return true;
153
154 ret = stat(KVM_DEBUG_FS, &st);
155 if (ret < 0)
156 return false;
157
158 if (!S_ISDIR(st.st_mode))
159 return false;
160
161 return kvm_scaling_check();
162 }
163
kvm_open_vcpu_dir(struct kvm_clock_sync * kvm,int cpu,char * dir_str)164 static int kvm_open_vcpu_dir(struct kvm_clock_sync *kvm, int cpu, char *dir_str)
165 {
166 struct dirent *entry;
167 char path[PATH_MAX];
168 DIR *dir;
169
170 dir = opendir(dir_str);
171 if (!dir)
172 goto error;
173 while ((entry = readdir(dir))) {
174 if (entry->d_type != DT_DIR) {
175 if (!strcmp(entry->d_name, KVM_DEBUG_OFFSET_FILE)) {
176 snprintf(path, sizeof(path), "%s/%s",
177 dir_str, entry->d_name);
178 kvm->vcpu_offsets[cpu] = strdup(path);
179 }
180 if (!strcmp(entry->d_name, KVM_DEBUG_SCALING_FILE)) {
181 snprintf(path, sizeof(path), "%s/%s",
182 dir_str, entry->d_name);
183 kvm->vcpu_scalings[cpu] = strdup(path);
184 }
185 if (!strcmp(entry->d_name, KVM_DEBUG_FRACTION_FILE)) {
186 snprintf(path, sizeof(path), "%s/%s",
187 dir_str, entry->d_name);
188 kvm->vcpu_frac[cpu] = strdup(path);
189 }
190 }
191 }
192 if (!kvm->vcpu_offsets[cpu])
193 goto error;
194 closedir(dir);
195 return 0;
196
197 error:
198 if (dir)
199 closedir(dir);
200 free(kvm->vcpu_offsets[cpu]);
201 kvm->vcpu_offsets[cpu] = NULL;
202 free(kvm->vcpu_scalings[cpu]);
203 kvm->vcpu_scalings[cpu] = NULL;
204 free(kvm->vcpu_frac[cpu]);
205 kvm->vcpu_frac[cpu] = NULL;
206 return -1;
207 }
208
kvm_open_debug_files(struct kvm_clock_sync * kvm,int pid)209 static int kvm_open_debug_files(struct kvm_clock_sync *kvm, int pid)
210 {
211 char *vm_dir_str = NULL;
212 struct dirent *entry;
213 char *pid_str = NULL;
214 char path[PATH_MAX];
215 long vcpu;
216 DIR *dir;
217 int i;
218
219 dir = opendir(KVM_DEBUG_FS);
220 if (!dir)
221 goto error;
222 if (asprintf(&pid_str, "%d-", pid) <= 0)
223 goto error;
224 while ((entry = readdir(dir))) {
225 if (!(entry->d_type == DT_DIR &&
226 !strncmp(entry->d_name, pid_str, strlen(pid_str))))
227 continue;
228 asprintf(&vm_dir_str, "%s/%s", KVM_DEBUG_FS, entry->d_name);
229 break;
230 }
231 closedir(dir);
232 dir = NULL;
233 if (!vm_dir_str)
234 goto error;
235 dir = opendir(vm_dir_str);
236 if (!dir)
237 goto error;
238 while ((entry = readdir(dir))) {
239 if (!(entry->d_type == DT_DIR &&
240 !strncmp(entry->d_name, KVM_DEBUG_VCPU_DIR, strlen(KVM_DEBUG_VCPU_DIR))))
241 continue;
242 vcpu = strtol(entry->d_name + strlen(KVM_DEBUG_VCPU_DIR), NULL, 10);
243 if (vcpu < 0 || vcpu >= kvm->vcpu_count)
244 continue;
245 snprintf(path, sizeof(path), "%s/%s", vm_dir_str, entry->d_name);
246 if (kvm_open_vcpu_dir(kvm, vcpu, path) < 0)
247 goto error;
248 }
249 for (i = 0; i < kvm->vcpu_count; i++) {
250 if (!kvm->vcpu_offsets[i])
251 goto error;
252 }
253 closedir(dir);
254 free(pid_str);
255 free(vm_dir_str);
256 return 0;
257 error:
258 free(pid_str);
259 free(vm_dir_str);
260 if (dir)
261 closedir(dir);
262 return -1;
263 }
264
kvm_clock_sync_init_host(struct tracecmd_time_sync * tsync,struct kvm_clock_sync * kvm)265 static int kvm_clock_sync_init_host(struct tracecmd_time_sync *tsync,
266 struct kvm_clock_sync *kvm)
267 {
268 kvm->vcpu_count = tsync->vcpu_count;
269 kvm->vcpu_offsets = calloc(kvm->vcpu_count, sizeof(char *));
270 kvm->vcpu_scalings = calloc(kvm->vcpu_count, sizeof(char *));
271 kvm->vcpu_frac = calloc(kvm->vcpu_count, sizeof(char *));
272 if (!kvm->vcpu_offsets || !kvm->vcpu_scalings || !kvm->vcpu_frac)
273 goto error;
274 if (kvm_open_debug_files(kvm, tsync->guest_pid) < 0)
275 goto error;
276 return 0;
277
278 error:
279 free(kvm->vcpu_offsets);
280 free(kvm->vcpu_scalings);
281 free(kvm->vcpu_frac);
282 return -1;
283 }
284
kvm_clock_sync_init_guest(struct tracecmd_time_sync * tsync,struct kvm_clock_sync * kvm)285 static int kvm_clock_sync_init_guest(struct tracecmd_time_sync *tsync,
286 struct kvm_clock_sync *kvm)
287 {
288 const char *systems[] = {"ftrace", NULL};
289 struct clock_sync_context *clock_context;
290 struct tep_event *raw;
291 char *path;
292
293 clock_context = (struct clock_sync_context *)tsync->context;
294 path = tracefs_instance_get_dir(clock_context->instance);
295 if (!path)
296 goto error;
297 kvm->tep = tracefs_local_events_system(path, systems);
298 tracefs_put_tracing_file(path);
299 if (!kvm->tep)
300 goto error;
301 raw = tep_find_event_by_name(kvm->tep, "ftrace", "raw_data");
302 if (!raw)
303 goto error;
304
305 kvm->raw_id = raw->id;
306 tep_set_file_bigendian(kvm->tep, tracecmd_host_bigendian());
307 tep_set_local_bigendian(kvm->tep, tracecmd_host_bigendian());
308
309 path = tracefs_instance_get_file(clock_context->instance, "trace_marker_raw");
310 if (!path)
311 goto error;
312 kvm->marker_fd = open(path, O_WRONLY);
313 tracefs_put_tracing_file(path);
314
315 return 0;
316
317 error:
318 if (kvm->tep)
319 tep_free(kvm->tep);
320 if (kvm->marker_fd >= 0)
321 close(kvm->marker_fd);
322
323 return -1;
324 }
325
kvm_clock_sync_init(struct tracecmd_time_sync * tsync)326 static int kvm_clock_sync_init(struct tracecmd_time_sync *tsync)
327 {
328 struct clock_sync_context *clock_context;
329 struct kvm_clock_sync *kvm;
330 int ret;
331
332 if (!tsync || !tsync->context)
333 return -1;
334 clock_context = (struct clock_sync_context *)tsync->context;
335
336 if (!kvm_support_check(clock_context->is_guest))
337 return -1;
338 kvm = calloc(1, sizeof(struct kvm_clock_sync));
339 if (!kvm)
340 return -1;
341 kvm->marker_fd = -1;
342 if (clock_context->is_guest)
343 ret = kvm_clock_sync_init_guest(tsync, kvm);
344 else
345 ret = kvm_clock_sync_init_host(tsync, kvm);
346 if (ret < 0)
347 goto error;
348
349 clock_context->proto_data = kvm;
350 return 0;
351
352 error:
353 free(kvm);
354 return -1;
355 }
356
kvm_clock_sync_free(struct tracecmd_time_sync * tsync)357 static int kvm_clock_sync_free(struct tracecmd_time_sync *tsync)
358 {
359 struct clock_sync_context *clock_context;
360 struct kvm_clock_sync *kvm = NULL;
361 int i;
362
363 clock_context = (struct clock_sync_context *)tsync->context;
364 if (clock_context)
365 kvm = (struct kvm_clock_sync *)clock_context->proto_data;
366 if (kvm) {
367 for (i = 0; i < kvm->vcpu_count; i++) {
368 free(kvm->vcpu_offsets[i]);
369 kvm->vcpu_offsets[i] = NULL;
370 free(kvm->vcpu_scalings[i]);
371 kvm->vcpu_scalings[i] = NULL;
372 free(kvm->vcpu_frac[i]);
373 kvm->vcpu_frac[i] = NULL;
374 }
375 if (kvm->tep)
376 tep_free(kvm->tep);
377 if (kvm->marker_fd >= 0)
378 close(kvm->marker_fd);
379 free(kvm);
380 }
381 return -1;
382 }
383
kvm_clock_host(struct tracecmd_time_sync * tsync,long long * offset,long long * scaling,long long * frac,long long * timestamp,unsigned int cpu)384 static int kvm_clock_host(struct tracecmd_time_sync *tsync,
385 long long *offset, long long *scaling, long long *frac,
386 long long *timestamp, unsigned int cpu)
387 {
388 char sync_proto[TRACECMD_TSYNC_PNAME_LENGTH];
389 struct clock_sync_context *clock_context;
390 struct kvm_clock_offset_msg packet;
391 struct kvm_clock_sync *kvm = NULL;
392 long long kvm_scaling = 1;
393 unsigned int sync_msg;
394 long long kvm_offset;
395 long long kvm_frac = 0;
396 unsigned int size;
397 char *msg;
398 int ret;
399
400 clock_context = (struct clock_sync_context *)tsync->context;
401 if (clock_context)
402 kvm = (struct kvm_clock_sync *)clock_context->proto_data;
403 if (!kvm || !kvm->vcpu_offsets || !kvm->vcpu_offsets[0])
404 return -1;
405 if (cpu >= kvm->vcpu_count)
406 return -1;
407 ret = read_ll_from_file(kvm->vcpu_offsets[cpu], &kvm_offset);
408 if (ret < 0)
409 return -1;
410
411 if (kvm->vcpu_scalings && kvm->vcpu_scalings[cpu]) {
412 read_ll_from_file(kvm->vcpu_scalings[cpu], &kvm_scaling);
413 if (kvm_scaling == KVM_SCALING_AMD_DEFAULT ||
414 kvm_scaling == KVM_SCALING_INTEL_DEFAULT)
415 kvm_scaling = 1;
416 }
417
418 if (kvm->vcpu_frac && kvm->vcpu_frac[cpu] && kvm_scaling != 1)
419 ret = read_ll_from_file(kvm->vcpu_frac[cpu], &kvm_frac);
420 msg = (char *)&packet;
421 size = sizeof(packet);
422 ret = tracecmd_msg_recv_time_sync(tsync->msg_handle,
423 sync_proto, &sync_msg,
424 &size, &msg);
425 if (ret || strncmp(sync_proto, KVM_NAME, TRACECMD_TSYNC_PNAME_LENGTH) ||
426 sync_msg != KVM_SYNC_PKT_REQUEST)
427 return -1;
428
429 packet.offset = -kvm_offset;
430 packet.scaling = kvm_scaling;
431 packet.frac = kvm_frac;
432 ret = tracecmd_msg_send_time_sync(tsync->msg_handle, KVM_NAME,
433 KVM_SYNC_PKT_RESPONSE, sizeof(packet),
434 (char *)&packet);
435 if (ret)
436 return -1;
437
438 *scaling = packet.scaling;
439 *offset = packet.offset;
440 *frac = kvm_frac;
441 *timestamp = packet.ts;
442
443 return 0;
444 }
445
446 #define KVM_EVENT_MARKER "kvm sync event"
/*
 * tracefs_iterate_raw_events() callback: look for the raw trace marker
 * written by the guest side of the sync and record its timestamp in
 * kvm->ts. Returns 1 to stop the iteration once found, 0 otherwise.
 */
static int kvm_marker_find(struct tep_event *event, struct tep_record *record,
			   int cpu, void *context)
{
	struct kvm_clock_sync *kvm = (struct kvm_clock_sync *)context;
	struct tep_format_field *field;
	struct tep_format_field *id;
	char *marker;

	/* Make sure this is our event */
	if (event->id != kvm->raw_id)
		return 0;
	id = tep_find_field(event, "id");
	field = tep_find_field(event, "buf");
	/*
	 * NOTE(review): the marker text is read at id->offset rather than
	 * field->offset ("buf") — presumably the raw_data payload starts at
	 * the "id" field; confirm against the ftrace raw_data event layout.
	 */
	if (field && id &&
	    record->size >= (id->offset + strlen(KVM_EVENT_MARKER) + 1)) {
		marker = (char *)(record->data + id->offset);
		if (!strcmp(marker, KVM_EVENT_MARKER)) {
			/* remember when the guest wrote the marker */
			kvm->ts = record->ts;
			return 1;
		}
	}

	return 0;
}
471
kvm_clock_guest(struct tracecmd_time_sync * tsync,long long * offset,long long * scaling,long long * frac,long long * timestamp)472 static int kvm_clock_guest(struct tracecmd_time_sync *tsync,
473 long long *offset,
474 long long *scaling,
475 long long *frac,
476 long long *timestamp)
477 {
478 char sync_proto[TRACECMD_TSYNC_PNAME_LENGTH];
479 struct clock_sync_context *clock_context;
480 struct kvm_clock_offset_msg packet;
481 struct kvm_clock_sync *kvm = NULL;
482 unsigned int sync_msg;
483 unsigned int size;
484 char *msg;
485 int ret;
486
487 clock_context = (struct clock_sync_context *)tsync->context;
488 if (clock_context)
489 kvm = (struct kvm_clock_sync *)clock_context->proto_data;
490 if (!kvm)
491 return -1;
492 kvm->ts = 0;
493 memset(&packet, 0, sizeof(packet));
494 tracefs_instance_file_write(clock_context->instance, "trace", "\0");
495 write(kvm->marker_fd, KVM_EVENT_MARKER, strlen(KVM_EVENT_MARKER) + 1);
496 kvm->ts = 0;
497 tracefs_iterate_raw_events(kvm->tep, clock_context->instance,
498 NULL, 0, kvm_marker_find, kvm);
499 packet.ts = kvm->ts;
500 ret = tracecmd_msg_send_time_sync(tsync->msg_handle, KVM_NAME,
501 KVM_SYNC_PKT_REQUEST, sizeof(packet),
502 (char *)&packet);
503 if (ret)
504 return -1;
505 msg = (char *)&packet;
506 size = sizeof(packet);
507 ret = tracecmd_msg_recv_time_sync(tsync->msg_handle,
508 sync_proto, &sync_msg,
509 &size, &msg);
510 if (ret || strncmp(sync_proto, KVM_NAME, TRACECMD_TSYNC_PNAME_LENGTH) ||
511 sync_msg != KVM_SYNC_PKT_RESPONSE)
512 return -1;
513
514 *scaling = packet.scaling;
515 *offset = packet.offset;
516 *frac = packet.frac;
517 *timestamp = packet.ts;
518 return 0;
519 }
520
kvm_clock_sync_calc(struct tracecmd_time_sync * tsync,long long * offset,long long * scaling,long long * frac,long long * timestamp,unsigned int cpu)521 static int kvm_clock_sync_calc(struct tracecmd_time_sync *tsync,
522 long long *offset, long long *scaling, long long *frac,
523 long long *timestamp, unsigned int cpu)
524 {
525 struct clock_sync_context *clock_context;
526 int ret;
527
528 if (!tsync || !tsync->context)
529 return -1;
530
531 clock_context = (struct clock_sync_context *)tsync->context;
532
533 if (clock_context->is_guest)
534 ret = kvm_clock_guest(tsync, offset, scaling, frac, timestamp);
535 else
536 ret = kvm_clock_host(tsync, offset, scaling, frac, timestamp, cpu);
537 return ret;
538 }
539
kvm_clock_sync_register(void)540 int kvm_clock_sync_register(void)
541 {
542 int role = TRACECMD_TIME_SYNC_ROLE_GUEST;
543 int clock = 0;
544
545 if (kvm_support_check(false)) {
546 role |= TRACECMD_TIME_SYNC_ROLE_HOST;
547 clock = TRACECMD_CLOCK_X86_TSC;
548 }
549 return tracecmd_tsync_proto_register(KVM_NAME, KVM_ACCURACY,
550 role, clock, 0,
551 kvm_clock_sync_init,
552 kvm_clock_sync_free,
553 kvm_clock_sync_calc);
554 }
555
kvm_clock_sync_unregister(void)556 int kvm_clock_sync_unregister(void)
557 {
558 return tracecmd_tsync_proto_unregister(KVM_NAME);
559 }
560