* [RFC bpf-next] Speedup for verifier.c:do_misc_fixups
@ 2022-06-21 23:30 Eduard Zingerman
  2022-06-23  3:11 ` Alexei Starovoitov
  0 siblings, 1 reply; 5+ messages in thread
From: Eduard Zingerman @ 2022-06-21 23:30 UTC (permalink / raw)
  To: bpf, ast, andrii, daniel, kernel-team

Hi Everyone,

Below I describe a scenario in which the function `verifier.c:do_misc_fixups`
exhibits O(n**2) running time for certain BPF programs. I also describe
a possible adjustment to `verifier.c:do_misc_fixups` that avoids
this behavior. I can work on this adjustment in my spare time over the
next few weeks if the community is interested.

The function `verifier.c:do_misc_fixups` uses
`verifier.c:bpf_patch_insn_data` to replace single instructions with a
series of instructions. `verifier.c:bpf_patch_insn_data` is a
wrapper for the function `core.c:bpf_patch_insn_single`. The latter
function operates in the following manner:
- allocates a new instruction array of size `old size + patch size`;
- copies the old instructions;
- copies the patch instructions;
- adjusts the offset fields of all instructions.
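
Roughly, each such patch application amounts to the following (a
simplified user-space sketch, not the actual kernel code; the function
name is made up):

/* Illustrative sketch of the per-patch cost: every patch allocates a
 * new array, copies the whole program, and is followed by a full pass
 * over all instructions to adjust jump offsets crossing the patch.
 */
static struct bpf_insn *patch_single_sketch(struct bpf_insn *prog, int prog_len,
                                            int off, const struct bpf_insn *patch,
                                            int patch_len)
{
  int new_len = prog_len + patch_len - 1; /* one insn is replaced */
  struct bpf_insn *new_prog = calloc(new_len, sizeof(*new_prog));

  if (!new_prog)
    return NULL;
  memcpy(new_prog, prog, off * sizeof(*prog));                /* head  */
  memcpy(new_prog + off, patch, patch_len * sizeof(*patch));  /* patch */
  memcpy(new_prog + off + patch_len, prog + off + 1,
         (prog_len - off - 1) * sizeof(*prog));               /* tail  */
  /* ... then an O(prog_len) offset-adjustment pass over new_prog ... */
  return new_prog;
}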

This means that for pathological BPF programs
`verifier.c:do_misc_fixups` exhibits running time
proportional to the square of the program length.
An example of such a program looks as follows:

BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64);
... N times ...
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64);
BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0);
BPF_EXIT_INSN();

`verifier.c:do_misc_fixups` replaces each call to jiffies with 3
instructions. Hence the code above leads to the copying of roughly
(N + (N+2) + (N+4) + ... + (N+2N)) instructions, which is O(n**2).
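
(As a rough closed form: that sum has N+1 terms and equals
N*(N+1) + 2*(1 + 2 + ... + N) = 2*N*(N+1), i.e. about 2*N**2 copied
instructions, or roughly 16*N**2 bytes at 8 bytes per instruction,
before the per-patch offset-adjustment passes are counted.)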

Experimental results confirm this observation.  Here is the output of
the demo program:

  prog len     verif time usec
       128          1461
       256 x2       4132 x2.8
       512 x2      12510 x3.0
      1024 x2      44022 x3.5
      2048 x2     170479 x3.9
      4096 x2     646766 x3.8
      8192 x2    2557379 x4.0

The source code for this program is at the end of this email.

The following technique could be used to improve the running time of
`verifier.c:do_misc_fixups`:
- patches are not applied immediately but are accumulated;
- patches are stored in an intermediate array; the size of this array
  is doubled when the capacity for a new patch is insufficient;
- all patches are applied at once at the end of `verifier.c:do_misc_fixups`;
- an intermediate data structure is constructed to efficiently map
  between old and new instruction indexes;
- instruction offset fields are updated using this data structure.

In terms of C code, this could look as follows:

/* Describes a single patch:
 * BPF instruction at index `offset` is replaced by
 * a series of the instructions pointed to by `insns`.
 */
struct bpf_patch {
  int offset;             /* index inside the original program,
                           * the instruction at this index would be replaced.
                           */
  int insns_count;        /* size of the patch */
  int delta;              /* difference in indexes between original program and
                           * new program after application of all patches up to
                           * and including this one.
                           */
  struct bpf_insn *insns; /* patch instructions */
};

/* Used to accumulate patches */
struct bpf_patching_state {
  struct bpf_patch *patches; /* array of all patches */
  int patches_count;         /* current number of patches */
  int patches_capacity;      /* total capacity of the patches array */
  struct bpf_insn *insns;    /* array of patch instructions,
                              * bpf_patch::insns points to the middle of this array
                              */
  int insns_count;           /* first free index in the instructions array */
  int insns_capacity;        /* total capacity of the instructions array */
};

/* Allocates `patches` and `insns` arrays with some initial capacity */
static int init_patching_state(struct bpf_patching_state *state)
{ ... }

/* Adds a patch record to the `bpf_patching_state::patches` array and
 * copies `insns` to the end of the `bpf_patching_state::insns` array.
 * Whenever the capacity of either array is insufficient, its size is
 * doubled.
 *
 * It is assumed that patches are added in increasing order of
 * the `bpf_patch::offset` field.
 */
static int add_patch(struct bpf_patching_state *state,
                     int offset,
                     int insns_count,
                     struct bpf_insn *insns)
{ ... }

/* - fills in the `bpf_patch::delta` fields for all patches in `state`.
 * - allocates new program
 * - copies program and patch instructions using the `patch_and_copy_insn` function
 */
static struct bpf_insn* apply_patches(struct bpf_patching_state *state,
                                      struct bpf_insn *prog,
                                      int *prog_size)
{
  int delta = 0;
  for (int i = 0; i < state->patches_count; ++i) {
    delta += state->patches[i].insns_count - 1;
    state->patches[i].delta = delta;
  }
  ...
}

/* Uses binary search to find `bpf_patch::delta` corresponding to `index`.
 * `index` stands for the index of an instruction in the original program.
 * Looks for the closest `state->patches[?]->offset <= index` and returns its `delta`.
 */
static int lookup_delta_for_index(struct bpf_patching_state *state, int index)
{ ... }

/* Copies instruction at `src` to instruction at `dst` and adjusts `dst->off` field.
 * `pc` stands for the instruction index of `src` in the original program.
 */
static void patch_and_copy_insn(struct bpf_patching_state *state,
                                int pc,
                                struct bpf_insn *dst,
                                struct bpf_insn *src)
{
  int new_pc = pc + lookup_delta_for_index(state, pc);
  int new_dst = pc + dst->off + lookup_delta_for_index(state, pc + dst->off);

  memcpy(dst, src, sizeof(struct bpf_insn));
  dst->off = new_dst - new_pc;
}
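
For context, the intended call sequence inside
`verifier.c:do_misc_fixups` would be roughly the following (a sketch
only; `insns`, `insn_cnt`, `needs_rewrite` and `emit_rewrite` are
illustrative stand-ins for the existing verifier state and rewrite
logic):

/* Hypothetical call sequence (illustrative only): collect all patches
 * first, then rebuild the program and fix up offsets exactly once.
 */
struct bpf_patching_state state;

if (init_patching_state(&state))
  return -ENOMEM;
for (int i = 0; i < insn_cnt; i++) {
  struct bpf_insn patch[16]; /* illustrative upper bound on patch size */
  int patch_len;

  if (!needs_rewrite(&insns[i]))              /* illustrative predicate */
    continue;
  patch_len = emit_rewrite(&insns[i], patch); /* illustrative emitter */
  if (add_patch(&state, i, patch_len, patch))
    return -ENOMEM;
}
insns = apply_patches(&state, insns, &insn_cnt); /* single rebuild pass */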

Please let me know what you think about this change and whether or not
it should be implemented.

Best regards,
Eduard Zingerman

---
// Demo program
---

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/resource.h>
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <linux/filter.h>

#define BPF_FUNC_jiffies64 118

const int AVG_TRIES = 50;
const int MAX_ITERS = 6;
int verbose = 0;

static void stop(char* description) {
  fprintf(stderr, "%s: %s\n", description, strerror(errno));
  exit(1);
}

static struct bpf_insn *gen_prog(int size, int *real_size) {
  int i = 0;
  *real_size = size + 2;
  struct bpf_insn *prog = calloc(*real_size, sizeof(struct bpf_insn));

  if (!prog)
    stop("can't allocate prog");

  while (i < size)
    prog[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64);

  prog[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0);
  prog[i++] = BPF_EXIT_INSN();

  return prog;
}

static int get_verif_time_usec(struct bpf_insn *prog, int real_prog_len) {
  LIBBPF_OPTS(bpf_prog_load_opts, opts);
  char log[4096];

  opts.log_level = verbose ? 1 | 4 : 4;
  opts.log_buf = log;
  opts.log_size = sizeof(log);
 
  int prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT,
                              NULL, "GPL", prog, real_prog_len, &opts);
  log[sizeof(log)-1] = 0;

  if (verbose)
    printf("--- log start ---\n%s--- log end   ---\n", log);

  if (prog_fd < 0)
    stop("can't load prog");
  
  close(prog_fd);

  int verif_time_usec = 0;
  if (sscanf(log, "verification time %d usec", &verif_time_usec) != 1)
    stop("can't get verification time from log");

  return verif_time_usec;
}

static int cmpint(const void *a, const void *b) {
  return *(int*)a - *(int*)b;
}

static int get_avg_verif_time_usec(int prog_len) {
  int results[AVG_TRIES];
  int real_prog_len;
  struct bpf_insn *prog = gen_prog(prog_len, &real_prog_len);
  for (int i = 0; i < AVG_TRIES; ++i) {
    results[i] = get_verif_time_usec(prog, real_prog_len);
  }
  free(prog);
  qsort(results, AVG_TRIES, sizeof(int), cmpint);
  int total_usec = 0;
  int samples_num = 0;
  for (int i = AVG_TRIES / 4; i < (3 * AVG_TRIES / 4); ++i) {
    total_usec += results[i];
    samples_num += 1;
  }
  return total_usec / samples_num;
}

int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) {
  return vfprintf(stderr, format, args);
}

int main(int argc, char **argv) {
  libbpf_set_print(libbpf_print_fn);
  printf("%10s     %s\n", "prog len", "verif time usec");
  int prev_prog_len = -1;
  int prev_time = -1;
  for (int i = 0; i <= MAX_ITERS; ++i) {
    int prog_len = (1 << i) * 128;
    int avg_verif_time_usec = get_avg_verif_time_usec(prog_len);
    if (prev_prog_len >= 0)
      printf("%10d x%1.f %10d x%1.1f\n",
             prog_len,
             ((double)prog_len / prev_prog_len),
             avg_verif_time_usec,
             ((double)avg_verif_time_usec / prev_time));
    else
      printf("%10d    %10d\n", prog_len, avg_verif_time_usec);
    prev_prog_len = prog_len;
    prev_time = avg_verif_time_usec;
  }
  return 0;
}



* Re: [RFC bpf-next] Speedup for verifier.c:do_misc_fixups
  2022-06-21 23:30 [RFC bpf-next] Speedup for verifier.c:do_misc_fixups Eduard Zingerman
@ 2022-06-23  3:11 ` Alexei Starovoitov
  2022-06-23 14:02   ` Eduard Zingerman
  0 siblings, 1 reply; 5+ messages in thread
From: Alexei Starovoitov @ 2022-06-23  3:11 UTC (permalink / raw)
  To: Eduard Zingerman; +Cc: bpf, ast, andrii, daniel, kernel-team

On Wed, Jun 22, 2022 at 02:30:21AM +0300, Eduard Zingerman wrote:
> Hi Everyone,
> 
> Below I describe a scenario when function `verifier.c:do_misc_fixups`
> exhibits O(n**2) performance for certain BPF programs. I also describe
> a possible adjustment for the `verifier.c:do_misc_fixups` to avoid
> such behavior. I can work on this adjustment in my spare time for a
> few weeks if the community is interested.
> 
> The function `verifier.c:do_misc_fixups` uses
> `verifier.c:bpf_patch_insn_data` to replace single instructions with a
> series of instructions. The `verifier.c:bpf_patch_insn_data` is a
> wrapper for the function `core.c:bpf_patch_insn_single`. The latter
> function operates in the following manner:
> - allocates new instructions array of size `old size + patch size`;
> - copies old instructions;
> - copies patch instructions;
> - adjusts offset fields for all instructions.
> 
> This algorithm means that for pathological BPF programs the
> `verifier.c:do_misc_fixups` would demonstrate running time
> proportional to the square of the program length.
> An example of such program looks as follows:
> 
> BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64);
> ... N times ...
> BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64);
> BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0);
> BPF_EXIT_INSN();
> 
> `verifier.c:do_misc_fixups` replaces each call to jiffies by 3
> instructions. Hence the code above would lead to the copying of
> (N + N+2 + N+4 ... + N+2N) bytes, which is O(n**2).
> 
> Experimental results confirm this observation.  Here is the output of
> the demo program:
> 
>   prog len     verif time usec
>        128          1461
>        256 x2       4132 x2.8
>        512 x2      12510 x3.0
>       1024 x2      44022 x3.5
>       2048 x2     170479 x3.9
>       4096 x2     646766 x3.8
>       8192 x2    2557379 x4.0
> 
> The source code for this program is at the end of this email.
> 
> The following technique could be used to improve the running time of
> the `verifier.c:do_misc_fixups`:
> - patches are not applied immediately but are accumulated;
> - patches are stored in the intermediate array; the size of this array
>   is doubled when the capacity for the new patch is insufficient;
> - patches are applied at once at the end of the `verifier.c:do_misc_fixups`;
> - intermediate data structure is constructed to efficiently map
>   between old and new instruction indexes;
> - instruction offset fields are updated using this data structure.
> 
> In terms of the C code, this could look as follows:
> 
> /* Describes a single patch:
>  * BPF instruction at index `offset` is replaced by
>  * a series of the instructions pointed to by `insns`.
>  */
> struct bpf_patch {
>   int offset;             /* index inside the original program,
>                            * the instruction at this index would be replaced.
>                            */
>   int insns_count;        /* size of the patch */
>   int delta;              /* difference in indexes between original program and
>                            * new program after application of all patches up to
>                            * and including this one.
>                            */
>   struct bpf_insn *insns; /* patch instructions */
> };
> 
> /* Used to accumulate patches */
> struct bpf_patching_state {
>   struct bpf_patch *patches; /* array of all patches*/
>   int patches_count;         /* current amount of patches */
>   int patches_capacity;      /* total capacity of the patches array */
>   struct bpf_insn *insns;    /* array of patch instructions,
>                               * bpf_patch::insns points to the middle of this array
>                               */
>   int insns_count;           /* first free index in the instructions array */
>   int insns_capacity;        /* total capacity of the instructions array */
> };
> 
> /* Allocates `patches` and `insns` arrays with some initial capacity */
> static int init_patching_state(struct bpf_patching_state *state)
> { ... }
> 
> /* Adds a patch record to the `bpf_patching_state::patches` array.
>  * If array capacity is insufficient, its size is doubled.
>  * Copies `insns` to the end of the `bpf_patching_state::insns`.
>  * If array capacity is insufficient, its size is doubled.
>  *
>  * It is assumed that patches are added in increasing order of
>  * the `bpf_patch::offset` field.
>  */
> static int add_patch(struct bpf_patching_state *state,
>                      int offset,
>                      int insns_count,
>                      struct bpf_insn *insns)
> { ... }
> 
> /* - fills in the `bpf_patch::delta` fields for all patches in `state`.
>  * - allocates new program
>  * - copies program and patch instructions using the `patch_and_copy_insn` function
>  */
> static struct bpf_insn* apply_patches(struct bpf_patching_state *state,
>                                       struct bpf_insn *prog,
>                                       int *prog_size)
> {
>   int delta = 0;
>   for (int i = 0; i < state->patches_count; ++i) {
>     delta += state->patches[i].insns_count - 1;
>     state->patches[i].delta = delta;
>   }
>   ...
> }
> 
> /* Uses binary search to find `bpf_patch::delta` corresponding to `index`.
>  * `index` stands for the index of instruction in the original program.
>  * Looks for the closest `state->patches[?]->offset <= index` and returns it's `delta`.
>  */
> static int lookup_delta_for_index(struct bpf_patching_state *state, int index)
> { ... }
> 
> /* Copies instruction at `src` to instruction at `dst` and adjusts `dst->off` field.
>  * `pc` stands for the instruction index of `src` in the original program.
>  */
> static void patch_and_copy_insn(struct bpf_patching_state *state,
>                                 int pc,
>                                 struct bpf_insn *dst,
>                                 struct bpf_insn *src)
> {
>   int new_pc = pc + lookup_delta_for_index(state, pc);
>   int new_dst = pc + dst->off + lookup_delta_for_index(state, pc + dst->off);
> 
>   memcpy(dst, src, sizeof(struct bpf_insn));
>   dst->off = new_dst - new_pc;
> }
> 
> Please let me know what you think about this change and whether or not
> it should be implemented.

tbh sounds scary. We had so many bugs in the patch_insn over years.
The code is subtle.
The main overhead is from bpf_adj_branches() as you noticed.
We might even do it twice for programs larger than S16_MAX resulting
in O(n^3) behavior.
There is also bpf_adj_linfo, adjust_insn_aux_data, adjust_subprog_starts,
adjust_poke_descs.
Every time we patch.
It's certainly something worth optimizing if we can.
Before proceeding too far based on artificial test please collect
the number of patches and their sizes we currently do across all progs
in selftests. Maybe it's not that bad in real code.

As far as algo the patch_and_copy_insn() assumes that 'dst' insn is a branch?
I guess I'm missing some parts of the proposed algo.

Instead of delaying everything maybe we can do all patching inline
except bpf_adj_branches?
Remember only:
   int offset;             /* index inside the original program,
                            * the instruction at this index would be replaced.
                            */
   int insns_count;        /* size of the patch */
   int delta;              /* difference in indexes between original program and
                            * new program after application of all patches up to
                            * and including this one.
                            */
and do single bpf_adj_branches at the end ?


* Re: [RFC bpf-next] Speedup for verifier.c:do_misc_fixups
  2022-06-23  3:11 ` Alexei Starovoitov
@ 2022-06-23 14:02   ` Eduard Zingerman
  2022-06-24 18:39     ` Alexei Starovoitov
  0 siblings, 1 reply; 5+ messages in thread
From: Eduard Zingerman @ 2022-06-23 14:02 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: bpf, ast, andrii, daniel, kernel-team

> On Wed, 2022-06-22 at 20:11 -0700, Alexei Starovoitov wrote:
[...]
> tbh sounds scary. We had so many bugs in the patch_insn over years.
> The code is subtle.

In terms of testing strategy the following could be done:
- use pseudo-random testing to verify that `bpf_patch_insn_data` and
  the mechanism suggested in my email produce identical code.
- use pseudo-random testing to verify that offsets after rewrite point
  to expected instructions (e.g. use .imm field as a unique marker for
  the instruction).
- hand-written tests for corner cases.
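
A sketch of the differential check could look as follows (user-space
pseudo-code; every helper name below is a placeholder, not an existing
API):

/* Pseudo-random differential test (all helpers are placeholders):
 * both patching mechanisms must produce byte-identical programs.
 */
for (int trial = 0; trial < NUM_TRIALS; trial++) {
	int len_a, len_b;
	struct test_case tc = gen_random_prog_and_patches(&rng);

	struct bpf_insn *a = apply_with_bpf_patch_insn_data(&tc, &len_a);
	struct bpf_insn *b = apply_with_accumulated_patches(&tc, &len_b);

	assert(len_a == len_b);
	assert(memcmp(a, b, len_a * sizeof(*a)) == 0);
	free(a);
	free(b);
}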

Would you consider this to be sufficient?  (Probably does not sound
too reassuring coming from the person [me] who submits patches with
trivial use-after-free errors :)

[...]

> Before proceeding too far based on artificial test please collect
> the number of patches and their sizes we currently do across all progs
> in selftests. Maybe it's not that bad in real code.

I will collect and share the stats on Saturday or Sunday.

> As far as algo the patch_and_copy_insn() assumes that 'dst' insn is a branch?
> I guess I'm missing some parts of the proposed algo.

Sorry, I made a mistake in the original email, the code for
patch_and_copy_insn() should look as follows:

static void patch_and_copy_insn(struct bpf_patching_state *state, int pc,
				struct bpf_insn *dst, struct bpf_insn *src) {
	memcpy(dst, src, sizeof(struct bpf_insn));
	// TODO: other classes
	// TODO: also adjust imm for calls
	if (BPF_CLASS(src->code) == BPF_JMP) {
		int new_pc = pc + lookup_delta_for_index(state, pc);
		int dst_pc = pc + src->off + 1;
		int new_dst = dst_pc + lookup_delta_for_index(state, dst_pc);
		dst->off = new_dst - new_pc - 1;
	}
}


The logic is as follows:
- compute new instruction index for the old pc
- compute new instruction index for the (old pc + offset)
- use these values to compute the new offset
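
For example (made-up numbers): take a jump at pc=4 with off=3, i.e. the
old target is 4 + 3 + 1 = 8. Suppose a 3-insn patch was recorded at
index 1 (cumulative delta 2) and a 2-insn patch at index 6 (cumulative
delta 3). Then new_pc = 4 + 2 = 6, new_dst = 8 + 3 = 11, and the
rewritten offset is 11 - 6 - 1 = 4, which again points from the
relocated jump to the relocated target.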

The draft implementation of this algorithm is at the end of this
email.

> Instead of delaying everything maybe we can do all patching inline
> except bpf_adj_branches?
> Remember only:
>    int offset;             /* index inside the original program,
>                             * the instruction at this index would be replaced.
>                             */
>    int insns_count;        /* size of the patch */
>    int delta;              /* difference in indexes between original program and
>                             * new program after application of all patches up to
>                             * and including this one.
>                             */
> and do single bpf_adj_branches at the end ?

The algorithm consists of two parts:
- To avoid excessive copying, patches are accumulated in a separate
  array, and the size of this array is doubled each time its capacity
  is not sufficient to fit a new patch. Doubling keeps the number of
  reallocations at O(log(n)), and the total amount of data copied
  across all reallocations is bounded by the final array size,
  i.e. O(n).
- To avoid excessive branch adjustment passes, a single branch
  adjustment pass is performed at the end. This pass visits each
  instruction only once; however, for each instruction it has to
  query the delta value in a sorted array, so the overall
  complexity of this pass is O(n*log(n)). It is possible to
  adjust some relevant fields in `env` during this pass as well.

If the first part is skipped and patching is done inline, then for
each patch:
- `realloc` is invoked (relatively cheap?)
- `memcpy` is invoked to copy the head and tail of the current
  program.  This has O(n**2) worst case complexity in terms of copied
  memory.

Thanks,
Eduard

---

/* Draft implementation of the proposed algorithm, written as user
 * space code, thus it calls bzero, calloc, etc. It probably still has
 * bugs, but I'll figure them out. */
   

#define _GNU_SOURCE /* for reallocarray() */

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#define BPF_CLASS(code) ((code)&0x07)
#define BPF_JMP 0x05

struct bpf_insn {
	uint8_t code;
	uint8_t dst_reg : 4;
	uint8_t src_reg : 4;
	int16_t off;
	int32_t imm;
};

struct bpf_patch {
	int offset;
	int insns_count;
	int delta;
	struct bpf_insn *insns;
};

struct bpf_patching_state {
	struct bpf_patch *patches;
	int patches_count;
	int patches_capacity;
	struct bpf_insn *insns;
	int insns_count;
	int insns_capacity;
};

static int init_patching_state(struct bpf_patching_state *state) {
	bzero(state, sizeof(*state));
	state->patches_count = 0;
	state->patches_capacity = 1; // small number, just for testing
	state->patches = calloc(state->patches_capacity, sizeof(struct bpf_patch));
	if (!state->patches)
		goto err;
	state->insns_count = 0;
	state->insns_capacity = 2; // small number, just for testing
	state->insns = calloc(state->insns_capacity, sizeof(struct bpf_insn));
	if (!state->insns)
		goto err;

	return 0;

err:
	if (state->patches)
		free(state->patches);
	if (state->insns)
		free(state->insns);
	return -ENOMEM;
}

static void free_patching_state(struct bpf_patching_state *state) {
	free(state->patches);
	free(state->insns);
	bzero(state, sizeof(*state));
}

static int add_patch(struct bpf_patching_state *state, int offset,
		     int insns_count, struct bpf_insn *insns)
{
	if (state->patches_count == state->patches_capacity) {
		struct bpf_patch *new_patches =
			reallocarray(state->patches, state->patches_capacity * 2,
				         sizeof(struct bpf_patch));
		if (!new_patches)
			return -ENOMEM;
		state->patches = new_patches;
		state->patches_capacity *= 2;
	}

	int insns_available = state->insns_capacity - state->insns_count;
	if (insns_available < insns_count) {
		int cur_capacity = (state->insns_capacity > insns_count
							? state->insns_capacity
							: insns_count);
		int new_capacity = cur_capacity * 2;
		struct bpf_insn *new_insns =
			reallocarray(state->insns, new_capacity, sizeof(struct bpf_insn));
		if (!new_insns)
			return -ENOMEM;
		state->insns = new_insns;
		state->insns_capacity = new_capacity;
	}

	struct bpf_patch *patch = &state->patches[state->patches_count];
	struct bpf_insn *dest_insns = &state->insns[state->insns_count];
	state->patches_count += 1;
	state->insns_count += insns_count;
	patch->offset = offset;
	patch->insns_count = insns_count;
	/* Note: this keeps pointing at the caller's buffer; the copy made
	 * into state->insns below is not yet used by apply_patches() in
	 * this draft.
	 */
	patch->insns = insns;
	patch->delta = 0;
	memcpy(dest_insns, insns, insns_count * sizeof(struct bpf_insn));

	return 0;
}

static int lookup_delta_for_index(struct bpf_patching_state *state, int index)
{
	struct bpf_patch *patches = state->patches;

	if (state->patches_count == 0 || index < patches[0].offset)
		return 0;

	if (state->patches_count == 1)
		return patches[0].delta;

	int l = 0;
	int r = state->patches_count - 1;

	while (r - l != 1) {
		int m = l + (r - l) / 2;
		if (patches[m].offset < index)
			l = m;
		else
			r = m;
	}

	if (patches[r].offset < index)
		return patches[r].delta;
	else
		return patches[l].delta;
}

static void patch_and_copy_insn(struct bpf_patching_state *state, int pc,
				struct bpf_insn *dst, struct bpf_insn *src) {
	memcpy(dst, src, sizeof(struct bpf_insn));
	// TODO: other classes
	// TODO: also adjust imm for calls
	if (BPF_CLASS(src->code) == BPF_JMP) {
		int new_pc = pc + lookup_delta_for_index(state, pc);
		int dst_pc = pc + src->off + 1;
		int new_dst = dst_pc + lookup_delta_for_index(state, dst_pc);
		dst->off = new_dst - new_pc - 1;
	}
}

static struct bpf_insn *apply_patches(struct bpf_patching_state *state,
				      struct bpf_insn *prog, int *prog_size)
{
	int delta = 0;
	for (int i = 0; i < state->patches_count; ++i) {
		delta += state->patches[i].insns_count - 1;
		state->patches[i].delta = delta;
	}
	int old_prog_size = *prog_size;
	int new_prog_size = old_prog_size + delta;
	struct bpf_insn *new_prog = calloc(new_prog_size, sizeof(struct bpf_insn));
	if (!new_prog)
		return NULL;

	*prog_size = new_prog_size;

	struct bpf_insn *old_insn = prog;
	struct bpf_insn *new_insn = new_prog;
	int next_patch = 0;
	for (int i = 0; i < old_prog_size; ++i) {
		// TODO: deal with double-word immediate values correctly
		if (next_patch < state->patches_count &&
		    state->patches[next_patch].offset == i) {
			struct bpf_patch *patch = &state->patches[next_patch];
			for (int j = 0; j < patch->insns_count; ++j) {
				patch_and_copy_insn(state, i + j,
									new_insn, &patch->insns[j]);
				++new_insn;
			}
			++next_patch;
		} else {
			patch_and_copy_insn(state, i, new_insn, old_insn);
			++new_insn;
		}
		++old_insn;
	}

	return new_prog;
}

static void show_prog(struct bpf_insn *prog, int cnt) {
	for (int i = 0; i < cnt; ++i) {
		printf("%02d: .code = %02d, .off = %+03d, .imm = %+03d", i, prog[i].code,
		       prog[i].off, prog[i].imm);
		if (BPF_CLASS(prog[i].code) == BPF_JMP) {
			int jmp_idx = i + prog[i].off + 1;
			printf(", jmp to %02d .imm = %+03d", jmp_idx, prog[jmp_idx].imm);
		}
		printf("\n");
	}
}

int main(int argc, char **argv) {
	struct bpf_insn prog[] = {
		{.code = 0, .imm = 0},
		{.code = BPF_JMP, .off = 3, .imm = 1},
		{.code = 0, .imm = 2},
		{.code = 0, .imm = 3},
		{.code = BPF_JMP, .off = 1, .imm = 4},
		{.code = 0, .imm = 5},
		{.code = 0, .imm = 6},
		{.code = BPF_JMP, .off = -5, .imm = 7},
		{.code = 0, .imm = 8},
		{.code = BPF_JMP, .off = 0, .imm = 9},
		{.code = 0, .imm = 10},
		{.code = BPF_JMP, .off = -11, .imm = 11},
	};
	int cnt = sizeof(prog) / sizeof(prog[0]);
	struct bpf_patching_state st;
	init_patching_state(&st);
	struct bpf_insn p1[] = {{.imm = -10}, {.imm = -11}};
	add_patch(&st, 0, 2, p1);
	struct bpf_insn p2[] = {
		{.imm = -20},
		{.imm = -21},
		{.imm = -22},
	};
	add_patch(&st, 2, 3, p2);
	struct bpf_insn p3[] = {
		{.imm = -30},
		{.imm = -31},
		{.code = BPF_JMP, .off = 1, .imm = -32}, // jump out of patch by 1 insn
		{.imm = -33}
	};
	add_patch(&st, 8, 4, p3);
	int new_prog_cnt = cnt;
	struct bpf_insn *new_prog = apply_patches(&st, prog, &new_prog_cnt);
	free_patching_state(&st);

	printf("Prog before:\n");
	show_prog(prog, cnt);
	printf("\nProg after:\n");
	show_prog(new_prog, new_prog_cnt);
	free(new_prog);

	return 0;
}


* Re: [RFC bpf-next] Speedup for verifier.c:do_misc_fixups
  2022-06-23 14:02   ` Eduard Zingerman
@ 2022-06-24 18:39     ` Alexei Starovoitov
  2022-06-24 18:57       ` Andrii Nakryiko
  0 siblings, 1 reply; 5+ messages in thread
From: Alexei Starovoitov @ 2022-06-24 18:39 UTC (permalink / raw)
  To: Eduard Zingerman
  Cc: bpf, ast, andrii, daniel, kernel-team, john.fastabend, yhs

On Thu, Jun 23, 2022 at 05:02:34PM +0300, Eduard Zingerman wrote:
> > On Wed, 2022-06-22 at 20:11 -0700, Alexei Starovoitov wrote:
> [...]
> > tbh sounds scary. We had so many bugs in the patch_insn over years.
> > The code is subtle.
> 
> In terms of testing strategy the following could be done:
> - use pseudo-random testing to verify that `bpf_patch_insn_data` and
>   the mechanism suggested in my email produce identical code.
> - use pseudo-random testing to verify that offsets after rewrite point
>   to expected instructions (e.g. use .imm field as a unique marker for
>   the instruction).
> - hand-written tests for corner cases.
> 
> Would you consider this to be sufficient?  (Probably does not sound
> too reassuring from the person[me] who submits patches with trivial
> use after free errors :)
> 
> [...]
> 
> > Before proceeding too far based on artificial test please collect
> > the number of patches and their sizes we currently do across all progs
> > in selftests. Maybe it's not that bad in real code.
> 
> I will collect and share the stats on Saturday or Sunday.
> 
> > As far as algo the patch_and_copy_insn() assumes that 'dst' insn is a branch?
> > I guess I'm missing some parts of the proposed algo.
> 
> Sorry, I made a mistake in the original email, the code for
> patch_and_copy_insn() should look as follows:
> 
> static void patch_and_copy_insn(struct bpf_patching_state *state, int pc,
> 				struct bpf_insn *dst, struct bpf_insn *src) {
> 	memcpy(dst, src, sizeof(struct bpf_insn));
> 	// TODO: other classes
> 	// TODO: also adjust imm for calls
> 	if (BPF_CLASS(src->code) == BPF_JMP) {
> 		int new_pc = pc + lookup_delta_for_index(state, pc);
> 		int dst_pc = pc + src->off + 1;
> 		int new_dst = dst_pc + lookup_delta_for_index(state, dst_pc);
> 		dst->off = new_dst - new_pc - 1;
> 	}
> }
> 
> 
> The logic is as follows:
> - compute new instruction index for the old pc
> - compute new instruction index for the (old pc + offset)
> - use these values to compute the new offset
> 
> The draft implementation of this algorithm is at the end of this
> email.
> 
> > Instead of delaying everything maybe we can do all patching inline
> > except bpf_adj_branches?
> > Remember only:
> >    int offset;             /* index inside the original program,
> >                             * the instruction at this index would be replaced.
> >                             */
> >    int insns_count;        /* size of the patch */
> >    int delta;              /* difference in indexes between original program and
> >                             * new program after application of all patches up to
> >                             * and including this one.
> >                             */
> > and do single bpf_adj_branches at the end ?
> 
> The algorithm consists of two parts:
> - To avoid excessive copying patches are accumulated in a separate
>   array and size of this array is doubled each time the capacity is
>   not sufficient to fit a new patch. This should have O(log(n))
>   complexity in terms of copied bytes.
> - To avoid excessive branch adjustment passes a single branch
>   adjustment pass is performed at the end. This pass visits each
>   instruction only once, however, for each instruction it will have
>   to query the delta value in a sorted array. Thus the overall
>   complexity of this pass would be O(n*log(n)). It is possible to
>   adjust some relevant fields in `env` during this pass as well.

Before jumping into coding let's explore the alternatives.
Can we use some of the compiler techniques?
Like:
- split a set of insn into basic blocks (BB)
- convert each jmp from relative offset to fixed bb index
- perform patching of insns. No jmp adjustments are necessary.
  Every patch applies within bb. Minimal realloc+copy of insns within bb.
- reconstruct a set of insn from a set of bb-s.

John, Yonghong, everyone,
may be other ideas?
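
For illustration only, such a representation might look roughly like
this (made-up structs, not a concrete proposal):

/* Rough sketch of a basic-block representation (illustrative only):
 * jumps are stored as basic-block indexes, so growing one block does
 * not disturb offsets anywhere else; instruction offsets are
 * recomputed once, when the blocks are linearized back into a flat
 * program.
 */
struct bpf_bb {
	struct bpf_insn *insns;  /* instructions of this basic block */
	int insns_count;
	int insns_capacity;      /* a patch grows only this block */
	int fallthrough_bb;      /* successors as basic-block indexes ... */
	int jump_bb;             /* ... instead of relative insn offsets */
};

struct bpf_bb_prog {
	struct bpf_bb *bbs;
	int bb_count;
};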


* Re: [RFC bpf-next] Speedup for verifier.c:do_misc_fixups
  2022-06-24 18:39     ` Alexei Starovoitov
@ 2022-06-24 18:57       ` Andrii Nakryiko
  0 siblings, 0 replies; 5+ messages in thread
From: Andrii Nakryiko @ 2022-06-24 18:57 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eduard Zingerman, bpf, Alexei Starovoitov, Andrii Nakryiko,
	Daniel Borkmann, Kernel Team, john fastabend, Yonghong Song

On Fri, Jun 24, 2022 at 11:39 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Jun 23, 2022 at 05:02:34PM +0300, Eduard Zingerman wrote:
> > > On Wed, 2022-06-22 at 20:11 -0700, Alexei Starovoitov wrote:
> > [...]
> > > tbh sounds scary. We had so many bugs in the patch_insn over years.
> > > The code is subtle.
> >
> > In terms of testing strategy the following could be done:
> > - use pseudo-random testing to verify that `bpf_patch_insn_data` and
> >   the mechanism suggested in my email produce identical code.
> > - use pseudo-random testing to verify that offsets after rewrite point
> >   to expected instructions (e.g. use .imm field as a unique marker for
> >   the instruction).
> > - hand-written tests for corner cases.
> >
> > Would you consider this to be sufficient?  (Probably does not sound
> > too reassuring from the person[me] who submits patches with trivial
> > use after free errors :)
> >
> > [...]
> >
> > > Before proceeding too far based on artificial test please collect
> > > the number of patches and their sizes we currently do across all progs
> > > in selftests. Maybe it's not that bad in real code.
> >
> > I will collect and share the stats on Saturday or Sunday.
> >
> > > As far as algo the patch_and_copy_insn() assumes that 'dst' insn is a branch?
> > > I guess I'm missing some parts of the proposed algo.
> >
> > Sorry, I made a mistake in the original email, the code for
> > patch_and_copy_insn() should look as follows:
> >
> > static void patch_and_copy_insn(struct bpf_patching_state *state, int pc,
> >                               struct bpf_insn *dst, struct bpf_insn *src) {
> >       memcpy(dst, src, sizeof(struct bpf_insn));
> >       // TODO: other classes
> >       // TODO: also adjust imm for calls
> >       if (BPF_CLASS(src->code) == BPF_JMP) {
> >               int new_pc = pc + lookup_delta_for_index(state, pc);
> >               int dst_pc = pc + src->off + 1;
> >               int new_dst = dst_pc + lookup_delta_for_index(state, dst_pc);
> >               dst->off = new_dst - new_pc - 1;
> >       }
> > }
> >
> >
> > The logic is as follows:
> > - compute new instruction index for the old pc
> > - compute new instruction index for the (old pc + offset)
> > - use these values to compute the new offset
> >
> > The draft implementation of this algorithm is at the end of this
> > email.
> >
> > > Instead of delaying everything maybe we can do all patching inline
> > > except bpf_adj_branches?
> > > Remember only:
> > >    int offset;             /* index inside the original program,
> > >                             * the instruction at this index would be replaced.
> > >                             */
> > >    int insns_count;        /* size of the patch */
> > >    int delta;              /* difference in indexes between original program and
> > >                             * new program after application of all patches up to
> > >                             * and including this one.
> > >                             */
> > > and do single bpf_adj_branches at the end ?
> >
> > The algorithm consists of two parts:
> > - To avoid excessive copying patches are accumulated in a separate
> >   array and size of this array is doubled each time the capacity is
> >   not sufficient to fit a new patch. This should have O(log(n))
> >   complexity in terms of copied bytes.
> > - To avoid excessive branch adjustment passes a single branch
> >   adjustment pass is performed at the end. This pass visits each
> >   instruction only once, however, for each instruction it will have
> >   to query the delta value in a sorted array. Thus the overall
> >   complexity of this pass would be O(n*log(n)). It is possible to
> >   adjust some relevant fields in `env` during this pass as well.
>
> Before jumping into coding let's explore the alternatives.
> Can we use some of the compiler techniques?
> Like:
> - split a set of insn into basic blocks (BB)
> - convert each jmp from relative offset to fixed bb index
> - perform patching of insns. No jmp adjustments are necessary.
>   Every patch applies within bb. Minimal realloc+copy of insns within bb.
> - reconstruct a set of insn from a set of bb-s.
>
> John, Yonghong, everyone,
> may be other ideas?

We've had a similar discussion ages (~3 years) ago. I had much more
context at that time, but I'll link to my latest proposal ([0]) within
that thread.

Roughly, the idea was along the lines of what Alexei proposed above,
but instead of basic blocks we kept all instructions as individual
linked-list nodes. So we'd need something like 2x the memory relative
to the current array of struct bpf_insns, but that seems totally
acceptable. And then, instead of updating relative indices, we'd keep a
mapping from the original instruction index to the latest real
instruction index. All this is pretty hazy for me at this point, but I
thought I'd link to that old discussion for completeness.

  [0] https://lore.kernel.org/bpf/CAEf4BzYDAVUgajz4=dRTu5xQDddp5pi2s=T1BdFmRLZjOwGypQ@mail.gmail.com/
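
For illustration only, the shape of such a linked-list representation
might be (made-up structs; not what the old proposal actually
specified):

/* Rough sketch (illustrative only): patching becomes node splicing,
 * and branch offsets are recomputed once at the end through the
 * original-index -> node mapping.
 */
struct bpf_insn_node {
	struct bpf_insn insn;
	struct bpf_insn_node *next;
};

struct bpf_insn_list {
	struct bpf_insn_node *head;
	struct bpf_insn_node **by_orig_idx; /* original insn index -> current node */
	int orig_len;
};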

