linux/kernel/trace/trace_syscalls.c
<<
>>
Prefs
   1#include <trace/syscall.h>
   2#include <trace/events/syscalls.h>
   3#include <linux/slab.h>
   4#include <linux/kernel.h>
   5#include <linux/ftrace.h>
   6#include <linux/perf_event.h>
   7#include <asm/syscall.h>
   8
   9#include "trace_output.h"
  10#include "trace.h"
  11
  /*
   * syscall_trace_lock is held by the register/unregister helpers in this
   * file while they update the refcounts and bitmaps below.
   *
   * sys_refcount_enter / sys_refcount_exit count the currently registered
   * ftrace syscall enter/exit events; the sys_enter/sys_exit probe is
   * registered when a count is zero and unregistered when it drops back
   * to zero.
   *
   * enabled_{enter,exit}_syscalls hold one bit per syscall number; the
   * ftrace probes test the bit to decide whether to record the call.
   */
  12static DEFINE_MUTEX(syscall_trace_lock);
  13static int sys_refcount_enter;
  14static int sys_refcount_exit;
  15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
  16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
  17
  /* Forward declarations: used by the event class initializers below. */
  18static int syscall_enter_register(struct ftrace_event_call *event,
  19                                 enum trace_reg type);
  20static int syscall_exit_register(struct ftrace_event_call *event,
  21                                 enum trace_reg type);
  22
  23static int syscall_enter_define_fields(struct ftrace_event_call *call);
  24static int syscall_exit_define_fields(struct ftrace_event_call *call);
  25
  26static struct list_head *
  27syscall_get_enter_fields(struct ftrace_event_call *call)
  28{
  29        struct syscall_metadata *entry = call->data;
  30
  31        return &entry->enter_fields;
  32}
  33
  34static struct list_head *
  35syscall_get_exit_fields(struct ftrace_event_call *call)
  36{
  37        struct syscall_metadata *entry = call->data;
  38
  39        return &entry->exit_fields;
  40}
  41
  /* Output callbacks: how syscall enter/exit records are printed. */
  42struct trace_event_functions enter_syscall_print_funcs = {
  43        .trace                  = print_syscall_enter,
  44};
  45
  46struct trace_event_functions exit_syscall_print_funcs = {
  47        .trace                  = print_syscall_exit,
  48};
  49
  /*
   * Event classes for syscall enter and exit events. Both live in the
   * "syscalls" system and share init_syscall_trace() as raw_init; they
   * differ in their register, define_fields and get_fields callbacks.
   */
  50struct ftrace_event_class event_class_syscall_enter = {
  51        .system                 = "syscalls",
  52        .reg                    = syscall_enter_register,
  53        .define_fields          = syscall_enter_define_fields,
  54        .get_fields             = syscall_get_enter_fields,
  55        .raw_init               = init_syscall_trace,
  56};
  57
  58struct ftrace_event_class event_class_syscall_exit = {
  59        .system                 = "syscalls",
  60        .reg                    = syscall_exit_register,
  61        .define_fields          = syscall_exit_define_fields,
  62        .get_fields             = syscall_get_exit_fields,
  63        .raw_init               = init_syscall_trace,
  64};
  65
  /*
   * Bounds of the syscall metadata array walked by find_syscall_meta().
   * NOTE(review): presumably provided by the linker script as section
   * start/stop symbols - confirm.
   */
  66extern unsigned long __start_syscalls_metadata[];
  67extern unsigned long __stop_syscalls_metadata[];
  68
  /*
   * Table indexed by syscall number; allocated and filled in by
   * init_ftrace_syscalls(), NULL until then.
   */
  69static struct syscall_metadata **syscalls_metadata;
  70
  71static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
  72{
  73        struct syscall_metadata *start;
  74        struct syscall_metadata *stop;
  75        char str[KSYM_SYMBOL_LEN];
  76
  77
  78        start = (struct syscall_metadata *)__start_syscalls_metadata;
  79        stop = (struct syscall_metadata *)__stop_syscalls_metadata;
  80        kallsyms_lookup(syscall, NULL, NULL, NULL, str);
  81
  82        for ( ; start < stop; start++) {
  83                /*
  84                 * Only compare after the "sys" prefix. Archs that use
  85                 * syscall wrappers may have syscalls symbols aliases prefixed
  86                 * with "SyS" instead of "sys", leading to an unwanted
  87                 * mismatch.
  88                 */
  89                if (start->name && !strcmp(start->name + 3, str + 3))
  90                        return start;
  91        }
  92        return NULL;
  93}
  94
  95static struct syscall_metadata *syscall_nr_to_meta(int nr)
  96{
  97        if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
  98                return NULL;
  99
 100        return syscalls_metadata[nr];
 101}
 102
 103enum print_line_t
 104print_syscall_enter(struct trace_iterator *iter, int flags,
 105                    struct trace_event *event)
 106{
 107        struct trace_seq *s = &iter->seq;
 108        struct trace_entry *ent = iter->ent;
 109        struct syscall_trace_enter *trace;
 110        struct syscall_metadata *entry;
 111        int i, ret, syscall;
 112
 113        trace = (typeof(trace))ent;
 114        syscall = trace->nr;
 115        entry = syscall_nr_to_meta(syscall);
 116
 117        if (!entry)
 118                goto end;
 119
 120        if (entry->enter_event->event.type != ent->type) {
 121                WARN_ON_ONCE(1);
 122                goto end;
 123        }
 124
 125        ret = trace_seq_printf(s, "%s(", entry->name);
 126        if (!ret)
 127                return TRACE_TYPE_PARTIAL_LINE;
 128
 129        for (i = 0; i < entry->nb_args; i++) {
 130                /* parameter types */
 131                if (trace_flags & TRACE_ITER_VERBOSE) {
 132                        ret = trace_seq_printf(s, "%s ", entry->types[i]);
 133                        if (!ret)
 134                                return TRACE_TYPE_PARTIAL_LINE;
 135                }
 136                /* parameter values */
 137                ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
 138                                       trace->args[i],
 139                                       i == entry->nb_args - 1 ? "" : ", ");
 140                if (!ret)
 141                        return TRACE_TYPE_PARTIAL_LINE;
 142        }
 143
 144        ret = trace_seq_putc(s, ')');
 145        if (!ret)
 146                return TRACE_TYPE_PARTIAL_LINE;
 147
 148end:
 149        ret =  trace_seq_putc(s, '\n');
 150        if (!ret)
 151                return TRACE_TYPE_PARTIAL_LINE;
 152
 153        return TRACE_TYPE_HANDLED;
 154}
 155
 156enum print_line_t
 157print_syscall_exit(struct trace_iterator *iter, int flags,
 158                   struct trace_event *event)
 159{
 160        struct trace_seq *s = &iter->seq;
 161        struct trace_entry *ent = iter->ent;
 162        struct syscall_trace_exit *trace;
 163        int syscall;
 164        struct syscall_metadata *entry;
 165        int ret;
 166
 167        trace = (typeof(trace))ent;
 168        syscall = trace->nr;
 169        entry = syscall_nr_to_meta(syscall);
 170
 171        if (!entry) {
 172                trace_seq_printf(s, "\n");
 173                return TRACE_TYPE_HANDLED;
 174        }
 175
 176        if (entry->exit_event->event.type != ent->type) {
 177                WARN_ON_ONCE(1);
 178                return TRACE_TYPE_UNHANDLED;
 179        }
 180
 181        ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
 182                                trace->ret);
 183        if (!ret)
 184                return TRACE_TYPE_PARTIAL_LINE;
 185
 186        return TRACE_TYPE_HANDLED;
 187}
 188
 /*
  * Declared here but not defined in this file; only referenced by
  * SYSCALL_FIELD() when the declared type and the record member differ
  * in size.
  * NOTE(review): presumably left undefined on purpose so that such a
  * mismatch fails at link time - confirm.
  */
 189extern char *__bad_type_size(void);
 190
 /*
  * SYSCALL_FIELD(type, name) expands to the five arguments that follow
  * the event call in trace_define_field(): type string, field name
  * string, offset of the member, size of the member, and signedness of
  * @type. When sizeof(type) differs from the member's size, the type
  * string is replaced by a call to __bad_type_size().
  *
  * The expansion refers to a variable named "trace", which must be in
  * scope at the point of use and be of the record type being described.
  */
 191#define SYSCALL_FIELD(type, name)                                       \
 192        sizeof(type) != sizeof(trace.name) ?                            \
 193                __bad_type_size() :                                     \
 194                #type, #name, offsetof(typeof(trace), name),            \
 195                sizeof(trace.name), is_signed_type(type)
 196
 197static
 198int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 199{
 200        int i;
 201        int pos = 0;
 202
 203        /* When len=0, we just calculate the needed length */
 204#define LEN_OR_ZERO (len ? len - pos : 0)
 205
 206        pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
 207        for (i = 0; i < entry->nb_args; i++) {
 208                pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
 209                                entry->args[i], sizeof(unsigned long),
 210                                i == entry->nb_args - 1 ? "" : ", ");
 211        }
 212        pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
 213
 214        for (i = 0; i < entry->nb_args; i++) {
 215                pos += snprintf(buf + pos, LEN_OR_ZERO,
 216                                ", ((unsigned long)(REC->%s))", entry->args[i]);
 217        }
 218
 219#undef LEN_OR_ZERO
 220
 221        /* return the length of print_fmt */
 222        return pos;
 223}
 224
 225static int set_syscall_print_fmt(struct ftrace_event_call *call)
 226{
 227        char *print_fmt;
 228        int len;
 229        struct syscall_metadata *entry = call->data;
 230
 231        if (entry->enter_event != call) {
 232                call->print_fmt = "\"0x%lx\", REC->ret";
 233                return 0;
 234        }
 235
 236        /* First: called with 0 length to calculate the needed length */
 237        len = __set_enter_print_fmt(entry, NULL, 0);
 238
 239        print_fmt = kmalloc(len + 1, GFP_KERNEL);
 240        if (!print_fmt)
 241                return -ENOMEM;
 242
 243        /* Second: actually write the @print_fmt */
 244        __set_enter_print_fmt(entry, print_fmt, len + 1);
 245        call->print_fmt = print_fmt;
 246
 247        return 0;
 248}
 249
 250static void free_syscall_print_fmt(struct ftrace_event_call *call)
 251{
 252        struct syscall_metadata *entry = call->data;
 253
 254        if (entry->enter_event == call)
 255                kfree(call->print_fmt);
 256}
 257
 258static int syscall_enter_define_fields(struct ftrace_event_call *call)
 259{
 260        struct syscall_trace_enter trace;
 261        struct syscall_metadata *meta = call->data;
 262        int ret;
 263        int i;
 264        int offset = offsetof(typeof(trace), args);
 265
 266        ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
 267        if (ret)
 268                return ret;
 269
 270        for (i = 0; i < meta->nb_args; i++) {
 271                ret = trace_define_field(call, meta->types[i],
 272                                         meta->args[i], offset,
 273                                         sizeof(unsigned long), 0,
 274                                         FILTER_OTHER);
 275                offset += sizeof(unsigned long);
 276        }
 277
 278        return ret;
 279}
 280
 281static int syscall_exit_define_fields(struct ftrace_event_call *call)
 282{
 283        struct syscall_trace_exit trace;
 284        int ret;
 285
 286        ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
 287        if (ret)
 288                return ret;
 289
 290        ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
 291                                 FILTER_OTHER);
 292
 293        return ret;
 294}
 295
 296void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 297{
 298        struct syscall_trace_enter *entry;
 299        struct syscall_metadata *sys_data;
 300        struct ring_buffer_event *event;
 301        struct ring_buffer *buffer;
 302        int size;
 303        int syscall_nr;
 304
 305        syscall_nr = syscall_get_nr(current, regs);
 306        if (syscall_nr < 0)
 307                return;
 308        if (!test_bit(syscall_nr, enabled_enter_syscalls))
 309                return;
 310
 311        sys_data = syscall_nr_to_meta(syscall_nr);
 312        if (!sys_data)
 313                return;
 314
 315        size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 316
 317        event = trace_current_buffer_lock_reserve(&buffer,
 318                        sys_data->enter_event->event.type, size, 0, 0);
 319        if (!event)
 320                return;
 321
 322        entry = ring_buffer_event_data(event);
 323        entry->nr = syscall_nr;
 324        syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
 325
 326        if (!filter_current_check_discard(buffer, sys_data->enter_event,
 327                                          entry, event))
 328                trace_current_buffer_unlock_commit(buffer, event, 0, 0);
 329}
 330
 331void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 332{
 333        struct syscall_trace_exit *entry;
 334        struct syscall_metadata *sys_data;
 335        struct ring_buffer_event *event;
 336        struct ring_buffer *buffer;
 337        int syscall_nr;
 338
 339        syscall_nr = syscall_get_nr(current, regs);
 340        if (syscall_nr < 0)
 341                return;
 342        if (!test_bit(syscall_nr, enabled_exit_syscalls))
 343                return;
 344
 345        sys_data = syscall_nr_to_meta(syscall_nr);
 346        if (!sys_data)
 347                return;
 348
 349        event = trace_current_buffer_lock_reserve(&buffer,
 350                        sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
 351        if (!event)
 352                return;
 353
 354        entry = ring_buffer_event_data(event);
 355        entry->nr = syscall_nr;
 356        entry->ret = syscall_get_return_value(current, regs);
 357
 358        if (!filter_current_check_discard(buffer, sys_data->exit_event,
 359                                          entry, event))
 360                trace_current_buffer_unlock_commit(buffer, event, 0, 0);
 361}
 362
 363int reg_event_syscall_enter(struct ftrace_event_call *call)
 364{
 365        int ret = 0;
 366        int num;
 367
 368        num = ((struct syscall_metadata *)call->data)->syscall_nr;
 369        if (num < 0 || num >= NR_syscalls)
 370                return -ENOSYS;
 371        mutex_lock(&syscall_trace_lock);
 372        if (!sys_refcount_enter)
 373                ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
 374        if (!ret) {
 375                set_bit(num, enabled_enter_syscalls);
 376                sys_refcount_enter++;
 377        }
 378        mutex_unlock(&syscall_trace_lock);
 379        return ret;
 380}
 381
 382void unreg_event_syscall_enter(struct ftrace_event_call *call)
 383{
 384        int num;
 385
 386        num = ((struct syscall_metadata *)call->data)->syscall_nr;
 387        if (num < 0 || num >= NR_syscalls)
 388                return;
 389        mutex_lock(&syscall_trace_lock);
 390        sys_refcount_enter--;
 391        clear_bit(num, enabled_enter_syscalls);
 392        if (!sys_refcount_enter)
 393                unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
 394        mutex_unlock(&syscall_trace_lock);
 395}
 396
 397int reg_event_syscall_exit(struct ftrace_event_call *call)
 398{
 399        int ret = 0;
 400        int num;
 401
 402        num = ((struct syscall_metadata *)call->data)->syscall_nr;
 403        if (num < 0 || num >= NR_syscalls)
 404                return -ENOSYS;
 405        mutex_lock(&syscall_trace_lock);
 406        if (!sys_refcount_exit)
 407                ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
 408        if (!ret) {
 409                set_bit(num, enabled_exit_syscalls);
 410                sys_refcount_exit++;
 411        }
 412        mutex_unlock(&syscall_trace_lock);
 413        return ret;
 414}
 415
 416void unreg_event_syscall_exit(struct ftrace_event_call *call)
 417{
 418        int num;
 419
 420        num = ((struct syscall_metadata *)call->data)->syscall_nr;
 421        if (num < 0 || num >= NR_syscalls)
 422                return;
 423        mutex_lock(&syscall_trace_lock);
 424        sys_refcount_exit--;
 425        clear_bit(num, enabled_exit_syscalls);
 426        if (!sys_refcount_exit)
 427                unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
 428        mutex_unlock(&syscall_trace_lock);
 429}
 430
 431int init_syscall_trace(struct ftrace_event_call *call)
 432{
 433        int id;
 434
 435        if (set_syscall_print_fmt(call) < 0)
 436                return -ENOMEM;
 437
 438        id = trace_event_raw_init(call);
 439
 440        if (id < 0) {
 441                free_syscall_print_fmt(call);
 442                return id;
 443        }
 444
 445        return id;
 446}
 447
 448unsigned long __init arch_syscall_addr(int nr)
 449{
 450        return (unsigned long)sys_call_table[nr];
 451}
 452
 453int __init init_ftrace_syscalls(void)
 454{
 455        struct syscall_metadata *meta;
 456        unsigned long addr;
 457        int i;
 458
 459        syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
 460                                        NR_syscalls, GFP_KERNEL);
 461        if (!syscalls_metadata) {
 462                WARN_ON(1);
 463                return -ENOMEM;
 464        }
 465
 466        for (i = 0; i < NR_syscalls; i++) {
 467                addr = arch_syscall_addr(i);
 468                meta = find_syscall_meta(addr);
 469                if (!meta)
 470                        continue;
 471
 472                meta->syscall_nr = i;
 473                syscalls_metadata[i] = meta;
 474        }
 475
 476        return 0;
 477}
 478core_initcall(init_ftrace_syscalls);
 479
 480#ifdef CONFIG_PERF_EVENTS
 481
 /*
  * perf counterparts of the ftrace bitmaps and refcounts: one bit per
  * syscall number tested by the perf probes, and counts of enabled perf
  * enter/exit events that decide when the perf probes are registered and
  * unregistered. Updated under syscall_trace_lock.
  */
 482static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
 483static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
 484static int sys_perf_refcount_enter;
 485static int sys_perf_refcount_exit;
 486
 487static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 488{
 489        struct syscall_metadata *sys_data;
 490        struct syscall_trace_enter *rec;
 491        struct hlist_head *head;
 492        int syscall_nr;
 493        int rctx;
 494        int size;
 495
 496        syscall_nr = syscall_get_nr(current, regs);
 497        if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
 498                return;
 499
 500        sys_data = syscall_nr_to_meta(syscall_nr);
 501        if (!sys_data)
 502                return;
 503
 504        /* get the size after alignment with the u32 buffer size field */
 505        size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
 506        size = ALIGN(size + sizeof(u32), sizeof(u64));
 507        size -= sizeof(u32);
 508
 509        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
 510                      "perf buffer not large enough"))
 511                return;
 512
 513        rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
 514                                sys_data->enter_event->event.type, regs, &rctx);
 515        if (!rec)
 516                return;
 517
 518        rec->nr = syscall_nr;
 519        syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 520                               (unsigned long *)&rec->args);
 521
 522        head = this_cpu_ptr(sys_data->enter_event->perf_events);
 523        perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
 524}
 525
 526int perf_sysenter_enable(struct ftrace_event_call *call)
 527{
 528        int ret = 0;
 529        int num;
 530
 531        num = ((struct syscall_metadata *)call->data)->syscall_nr;
 532
 533        mutex_lock(&syscall_trace_lock);
 534        if (!sys_perf_refcount_enter)
 535                ret = register_trace_sys_enter(perf_syscall_enter, NULL);
 536        if (ret) {
 537                pr_info("event trace: Could not activate"
 538                                "syscall entry trace point");
 539        } else {
 540                set_bit(num, enabled_perf_enter_syscalls);
 541                sys_perf_refcount_enter++;
 542        }
 543        mutex_unlock(&syscall_trace_lock);
 544        return ret;
 545}
 546
 547void perf_sysenter_disable(struct ftrace_event_call *call)
 548{
 549        int num;
 550
 551        num = ((struct syscall_metadata *)call->data)->syscall_nr;
 552
 553        mutex_lock(&syscall_trace_lock);
 554        sys_perf_refcount_enter--;
 555        clear_bit(num, enabled_perf_enter_syscalls);
 556        if (!sys_perf_refcount_enter)
 557                unregister_trace_sys_enter(perf_syscall_enter, NULL);
 558        mutex_unlock(&syscall_trace_lock);
 559}
 560
 561static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 562{
 563        struct syscall_metadata *sys_data;
 564        struct syscall_trace_exit *rec;
 565        struct hlist_head *head;
 566        int syscall_nr;
 567        int rctx;
 568        int size;
 569
 570        syscall_nr = syscall_get_nr(current, regs);
 571        if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
 572                return;
 573
 574        sys_data = syscall_nr_to_meta(syscall_nr);
 575        if (!sys_data)
 576                return;
 577
 578        /* We can probably do that at build time */
 579        size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
 580        size -= sizeof(u32);
 581
 582        /*
 583         * Impossible, but be paranoid with the future
 584         * How to put this check outside runtime?
 585         */
 586        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
 587                "exit event has grown above perf buffer size"))
 588                return;
 589
 590        rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
 591                                sys_data->exit_event->event.type, regs, &rctx);
 592        if (!rec)
 593                return;
 594
 595        rec->nr = syscall_nr;
 596        rec->ret = syscall_get_return_value(current, regs);
 597
 598        head = this_cpu_ptr(sys_data->exit_event->perf_events);
 599        perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
 600}
 601
 602int perf_sysexit_enable(struct ftrace_event_call *call)
 603{
 604        int ret = 0;
 605        int num;
 606
 607        num = ((struct syscall_metadata *)call->data)->syscall_nr;
 608
 609        mutex_lock(&syscall_trace_lock);
 610        if (!sys_perf_refcount_exit)
 611                ret = register_trace_sys_exit(perf_syscall_exit, NULL);
 612        if (ret) {
 613                pr_info("event trace: Could not activate"
 614                                "syscall exit trace point");
 615        } else {
 616                set_bit(num, enabled_perf_exit_syscalls);
 617                sys_perf_refcount_exit++;
 618        }
 619        mutex_unlock(&syscall_trace_lock);
 620        return ret;
 621}
 622
 623void perf_sysexit_disable(struct ftrace_event_call *call)
 624{
 625        int num;
 626
 627        num = ((struct syscall_metadata *)call->data)->syscall_nr;
 628
 629        mutex_lock(&syscall_trace_lock);
 630        sys_perf_refcount_exit--;
 631        clear_bit(num, enabled_perf_exit_syscalls);
 632        if (!sys_perf_refcount_exit)
 633                unregister_trace_sys_exit(perf_syscall_exit, NULL);
 634        mutex_unlock(&syscall_trace_lock);
 635}
 636
 637#endif /* CONFIG_PERF_EVENTS */
 638
 639static int syscall_enter_register(struct ftrace_event_call *event,
 640                                 enum trace_reg type)
 641{
 642        switch (type) {
 643        case TRACE_REG_REGISTER:
 644                return reg_event_syscall_enter(event);
 645        case TRACE_REG_UNREGISTER:
 646                unreg_event_syscall_enter(event);
 647                return 0;
 648
 649#ifdef CONFIG_PERF_EVENTS
 650        case TRACE_REG_PERF_REGISTER:
 651                return perf_sysenter_enable(event);
 652        case TRACE_REG_PERF_UNREGISTER:
 653                perf_sysenter_disable(event);
 654                return 0;
 655#endif
 656        }
 657        return 0;
 658}
 659
 660static int syscall_exit_register(struct ftrace_event_call *event,
 661                                 enum trace_reg type)
 662{
 663        switch (type) {
 664        case TRACE_REG_REGISTER:
 665                return reg_event_syscall_exit(event);
 666        case TRACE_REG_UNREGISTER:
 667                unreg_event_syscall_exit(event);
 668                return 0;
 669
 670#ifdef CONFIG_PERF_EVENTS
 671        case TRACE_REG_PERF_REGISTER:
 672                return perf_sysexit_enable(event);
 673        case TRACE_REG_PERF_UNREGISTER:
 674                perf_sysexit_disable(event);
 675                return 0;
 676#endif
 677        }
 678        return 0;
 679}
 680
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.