/*-
* Copyright (c) 2010-2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This material is based upon work partially supported by The
* NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* BPF byte-code generation for NPF rules.
*
* Overview
*
* Each NPF rule is compiled into a BPF micro-program. There is a
* BPF byte-code fragment for each higher-level filtering construct,
* e.g. to match the L4 protocol, an IP address/mask, etc. The
* generation process combines multiple BPF byte-code fragments into
* one program.
*
* Basic case
*
* Consider a basic case where all filters should match; they
* are expressed as a logical conjunction, e.g.:
*
* A and B and C and D
*
* Each test (filter) criterion evaluates to true (match) or
* false (no match), and the logic is as follows:
*
* - If the value is true, then jump to the "next" test (offset 0).
*
* - If the value is false, then jump to the JUMP_MAGIC value (0xff).
* This "magic" value is used to indicate that it will have to be
* patched at a later stage.
*
* Once all byte-code fragments are combined into one program,
* there are two additional steps:
*
* - Two instructions are appended at the end of the program: "return
* success" followed by "return failure".
*
* - All jumps with the JUMP_MAGIC value are patched to point to the
* "return failure" instruction.
*
* Therefore, if all filter criteria match, the first of the two
* appended instructions ("return success") is reached, indicating
* a successful match of the rule. Otherwise, if any of the
* criteria does not match, the failure path is taken and the rule
* does not match.
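*
* For illustration (a sketch, not necessarily the exact emitted
* code), the conjunction "A and B" compiles to roughly:
*
*	0: test A	jt=0 (next), jf=JUMP_MAGIC
*	1: test B	jt=0 (next), jf=JUMP_MAGIC
*	2: ret SUCCESS
*	3: ret FAILURE
*
* with both JUMP_MAGIC values then patched to jump to the "return
* failure" instruction (3).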
*
* Grouping
*
* Filters can have groups, which have an effect of logical
* disjunction, e.g.:
*
* A and B and (C or D)
*
* In such a case, the logic inside the group has to be inverted,
* i.e. the jump values swapped. If the test value is true, then
* jump out of the group; if false, then jump "next". At the end
* of the group, an additional failure path is appended and the
* JUMP_MAGIC uses within the group are patched to jump past the
* said path.
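*
* For illustration (again a sketch), a group "(C or D)" compiles
* to roughly:
*
*	0: test C	jt=2 (out), jf=0 (next)
*	1: test D	jt=1 (out), jf=0 (next)
*	2: ret FAILURE	fall-through if nothing in the group matched
*	3: ...		"out": continue with the next filter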
*
* For multi-word comparisons (IPv6 addresses), there is another
* layer of grouping:
*
* A and B and ((C and D) or (E and F))
*
* This strains the simple-minded JUMP_MAGIC logic, so for now,
* when generating the jump-if-false targets for (C and D), we
* simply count the number of instructions left to skip over.
*
* A better architecture might be to create asm-type labels for
* the jt and jf continuations in the first pass, and then, once
* their offsets are determined, go back and fill them in during a
* second pass. This would simplify the logic (no need to compute
* exactly how many instructions we're about to generate in a
* chain of conditionals) and eliminate redundant RET #0
* instructions which are currently generated after some groups.
*/
#include <sys/cdefs.h>
__RCSID("$NetBSD: npf_bpf_comp.c,v 1.19 2025/07/10 11:44:12 joe Exp $");
/*
* Note: clear X_EQ_L4OFF when register X is invalidated, i.e. when it
* stores something other than the L4 header offset -- generally, when
* BPF_LDX is used.
*/
#define FETCHED_L3 0x01
#define CHECKED_L4_PROTO 0x02
#define X_EQ_L4OFF 0x04
#define FETCHED_L2 0x08
struct npf_bpf {
/*
* BPF program code, the allocated length (in bytes), the number
* of logical blocks and the flags.
*/
struct bpf_program prog;
size_t alen;
unsigned nblocks;
sa_family_t af;
uint32_t flags;
uint8_t eth_type;
/*
* Indicators of whether we are inside a group and whether the
* group implements inverted logic.
*
* The current group offset (counted in BPF instructions)
* and block number at the start of the group.
*/
unsigned ingroup;
bool invert;
bool multiword;
unsigned goff;
unsigned gblock;
/* BPF marks, allocated length and the real length. */
uint32_t * marks;
size_t malen;
size_t mlen;
};
/*
* NPF success and failure values to be returned from BPF.
*/
#define NPF_BPF_SUCCESS ((u_int)-1)
#define NPF_BPF_FAILURE 0
/*
* Magic value to indicate the failure path, which is fixed up on completion.
* Note: this is the longest jump offset in BPF, since the offset is one byte.
*/
#define JUMP_MAGIC 0xff
for (u_int i = start; i < end; i++) {
struct bpf_insn *insn = &bp->bf_insns[i];
const u_int fail_off = end - i;
bool seen_magic = false;
if (fail_off >= JUMP_MAGIC) {
errx(EXIT_FAILURE, "BPF generation error: "
"the number of instructions is over the limit");
}
if (BPF_CLASS(insn->code) != BPF_JMP) {
continue;
}
if (BPF_OP(insn->code) == BPF_JA) {
/*
* BPF_JA can be used to jump to the failure path.
* If we are swapping, i.e. inside a group, then
* jump "next"; groups have a failure path appended
* at their end.
*/
if (insn->k == JUMP_MAGIC) {
insn->k = swap ? 0 : fail_off;
}
continue;
}
/*
* Fixup the "magic" value. Swap only the "magic" jumps.
*/
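/* Patch the "magic" jt/jf targets to the failure offset ... */
if (insn->jt == JUMP_MAGIC) {
insn->jt = fail_off;
seen_magic = true;
}
if (insn->jf == JUMP_MAGIC) {
insn->jf = fail_off;
seen_magic = true;
}
/* ... and, inside a group (swap), exchange the two. */
if (seen_magic && swap) {
const uint8_t jt = insn->jt;
insn->jt = insn->jf;
insn->jf = jt;
}
}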
/*
* If we are not inverting, the group had at most one option and
* the last comparison was not a multi-word comparison requiring
* a fall-through failure -- nothing to do.
*/
if (!ctx->invert &&
(ctx->nblocks - ctx->gblock) <= 1 &&
!ctx->multiword) {
ctx->goff = ctx->gblock = 0;
return;
}
/*
* If inverting, then prepend a jump over the failure return below.
* A fall-through (no match within the group) takes this jump and
* skips the failure return; on a match, the patched jumps land on
* the failure return instead.
*/
if (ctx->invert) {
struct bpf_insn insns_ret[] = {
BPF_STMT(BPF_JMP+BPF_JA, 1),
};
add_insns(ctx, insns_ret, __arraycount(insns_ret));
}
/*
* Append a failure return as the fall-through case, i.e. taken
* if there is no match within the group.
*/
struct bpf_insn insns_ret[] = {
BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
};
add_insns(ctx, insns_ret, __arraycount(insns_ret));
/*
* Adjust the jump offsets: on a match, jump out of the group,
* i.e. past the appended failure return. Otherwise, jump to the
* next instruction, eventually reaching the fall-through failure
* return above if nothing matches.
*/
fixup_jumps(ctx, ctx->goff, curoff, true);
ctx->goff = ctx->gblock = 0;
}
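/*
 * Typical usage of the group primitives (a sketch): a disjunction
 * such as "(C or D)" is emitted between the enter/exit calls:
 *
 *	npfctl_bpf_group_enter(ctx, false);
 *	...emit the blocks for C and D...
 *	npfctl_bpf_group_exit(ctx);
 */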
switch (af) {
case AF_INET:
ver = IPVERSION;
break;
case AF_INET6:
ver = IPV6_VERSION >> 4;
break;
case AF_UNSPEC:
ver = 0;
break;
default:
abort();
}
/*
* The memory store is populated with:
* - BPF_MW_IPVER: IP version (4 or 6).
* - BPF_MW_L4OFF: L4 header offset.
* - BPF_MW_L4PROTO: L4 protocol.
*/
if ((ctx->flags & FETCHED_L3) == 0 || (af && ctx->af == 0)) {
const uint8_t jt = ver ? 0 : JUMP_MAGIC;
const uint8_t jf = ver ? JUMP_MAGIC : 0;
const bool ingroup = ctx->ingroup != 0;
const bool invert = ctx->invert;
/*
* The L3 block cannot be inserted in the middle of a group.
* In fact, it never is. Check, exit the group and restart it
* afterwards.
*/
if (ingroup) {
assert(ctx->nblocks == ctx->gblock);
npfctl_bpf_group_exit(ctx);
}
/*
* A <- IP version; A == expected-version?
* If no particular version specified, check for non-zero.
*/
struct bpf_insn insns_af[] = {
BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_IPVER),
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ver, jt, jf),
};
add_insns(ctx, insns_af, __arraycount(insns_af));
ctx->flags |= FETCHED_L3;
ctx->af = af;
if (af) {
uint32_t mwords[] = { BM_IPVER, 1, af };
add_bmarks(ctx, mwords, sizeof(mwords));
}
if (ingroup) {
npfctl_bpf_group_enter(ctx, invert);
}
} else if (af && af != ctx->af) {
errx(EXIT_FAILURE, "address family mismatch");
}
const uint8_t jt = type ? 0 : JUMP_MAGIC;
const uint8_t jf = type ? JUMP_MAGIC : 0;
const bool ingroup = ctx->ingroup != 0;
const bool invert = ctx->invert;
unsigned off = offsetof(struct ether_header, ether_type);
/*
* The L2 block cannot be inserted in the middle of a group.
* Check, exit the group and restart it afterwards.
*/
if (ingroup) {
assert(ctx->nblocks == ctx->gblock);
npfctl_bpf_group_exit(ctx);
}
/* CAUTION: BPF operates in host byte-order. */
for (unsigned i = 0; i < nwords; i++) {
const unsigned woff = i * sizeof(uint32_t);
uint32_t word = ntohl(awords[i]);
uint32_t wordmask;
if (length >= 32) {
/* The mask is a full word - do not apply it. */
wordmask = 0;
length -= 32;
} else if (length) {
wordmask = 0xffffffff << (32 - length);
length = 0;
} else {
/* The mask became zero - skip the rest. */
break;
}
/* A <- IP address (or one word of it) */
struct bpf_insn insns_ip[] = {
BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off + woff),
};
add_insns(ctx, insns_ip, __arraycount(insns_ip));
/* A <- (A & MASK) */
if (wordmask) {
struct bpf_insn insns_mask[] = {
BPF_STMT(BPF_ALU+BPF_AND+BPF_K, wordmask),
};
add_insns(ctx, insns_mask, __arraycount(insns_mask));
}
/*
* Determine how many instructions we have to jump
* ahead if the match fails.
*
* - If this is the last word, we jump to the final
* failure, JUMP_MAGIC.
*
* - If this is not the last word, we jump past the
* remaining instructions to match this sequence.
* Each 32-bit word in the sequence takes two
* instructions (BPF_LD and BPF_JMP). If there is a
* partial-word mask ahead, there will be one
* additional instruction (BPF_ALU).
*/
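/*
 * For example (a hypothetical /48 IPv6 prefix, i.e. origlength
 * of 48 and two words): at i = 0 the first word is fully masked
 * (wordmask == 0), so jf = 2*1 + 1 = 3, skipping the BPF_LD,
 * BPF_ALU and BPF_JMP of the remaining partially-masked word.
 */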
uint8_t jf;
if (i + 1 == (origlength + 31)/32) {
jf = JUMP_MAGIC;
} else {
jf = 2*((origlength + 31)/32 - i - 1);
if (origlength % 32 != 0 && wordmask == 0)
jf += 1;
}
/* A == expected-IP-word ? */
struct bpf_insn insns_cmp[] = {
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, word, 0, jf),
};
add_insns(ctx, insns_cmp, __arraycount(insns_cmp));
}
/*
* If we checked a chain of words in sequence, mark this as a
* multi-word comparison so if this is in a group there will be
* a fallthrough case.
*
* XXX This is a little silly; the compiler should really just
* record holes where conditional jumps need success/failure
* continuations, and go back to fill in the holes when the
* locations of the continuations are determined later. But
* that requires restructuring this code a little more.
*/
ctx->multiword = (origlength + 31)/32 > 1;
/* Copy the last two bytes of the 6-byte Ethernet address. */
memcpy(&mac_hword, (uint8_t *)ether_addr + sizeof(mac_word), sizeof(mac_hword));
mac_hword = ntohs(mac_hword);
/* Load and compare the first word, then do the same for the last halfword. */
struct bpf_insn insns_ether_w[] = {
BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off),
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, mac_word, 0, 2),
};
add_insns(ctx, insns_ether_w, __arraycount(insns_ether_w));
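/*
 * The halfword comparison implied above (a sketch: the jump-if-false
 * offset is assumed to be JUMP_MAGIC, analogous to the last word of
 * a multi-word IP address comparison).
 */
struct bpf_insn insns_ether_h[] = {
BPF_STMT(BPF_LD+BPF_H+BPF_ABS, off + sizeof(mac_word)),
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, mac_hword, 0, JUMP_MAGIC),
};
add_insns(ctx, insns_ether_h, __arraycount(insns_ether_h));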
/*
* A <- L4 protocol; A == TCP? If not, jump out.
*
* Note: TCP flag matching may occur without 'proto tcp' when a
* plain 'stateful' rule is used. In such a case the rule also
* handles other protocols, hence there is no strict TCP check.
*/
struct bpf_insn insns_tcp[] = {
BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, IPPROTO_TCP, 0, jf),
};
add_insns(ctx, insns_tcp, __arraycount(insns_tcp));
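/*
 * A sketch of the flags test that follows: "tf" and "tf_mask" are
 * assumed to be the flags/mask parameters of the enclosing function
 * (they are not visible in this fragment).
 *
 * X <- L4 offset; A <- TCP flags; A <- (A & mask); A == flags?
 */
struct bpf_insn insns_tf[] = {
BPF_STMT(BPF_LDX+BPF_W+BPF_MEM, BPF_MW_L4OFF),
BPF_STMT(BPF_LD+BPF_B+BPF_IND, offsetof(struct tcphdr, th_flags)),
BPF_STMT(BPF_ALU+BPF_AND+BPF_K, tf_mask),
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, tf, 0, JUMP_MAGIC),
};
add_insns(ctx, insns_tf, __arraycount(insns_tf));
/* X now holds the L4 offset, so set X_EQ_L4OFF (see the note above). */
ctx->flags |= X_EQ_L4OFF;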
}
/*
* npfctl_bpf_icmp: code block to match ICMP type and/or code.
* Note: suitable for both ICMPv4 and ICMPv6.
*/
void
npfctl_bpf_icmp(npf_bpf_t *ctx, int type, int code)
{
const u_int type_off = offsetof(struct icmp, icmp_type);
const u_int code_off = offsetof(struct icmp, icmp_code);