/*      $NetBSD: bpf.c,v 1.258 2024/10/20 14:03:51 mlelstv Exp $        */

/*
* Copyright (c) 1990, 1991, 1993
*      The Regents of the University of California.  All rights reserved.
*
* This code is derived from the Stanford/CMU enet packet filter,
* (net/enet.c) distributed as part of 4.3BSD, and code contributed
* to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
* Berkeley Laboratory.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in the
*    documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
*    may be used to endorse or promote products derived from this software
*    without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*      @(#)bpf.c       8.4 (Berkeley) 1/9/95
* static char rcsid[] =
* "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp ";
*/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.258 2024/10/20 14:03:51 mlelstv Exp $");

#if defined(_KERNEL_OPT)
#include "opt_bpf.h"
#include "sl.h"
#include "opt_net_mpsafe.h"
#endif

#include <sys/param.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/errno.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/lwp.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/percpu.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/pserialize.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/tty.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/xcall.h>

#include <net/bpf.h>
#include <net/bpfdesc.h>
#include <net/bpfjit.h>
#include <net/if.h>
#include <net/if_arc.h>
#include <net/if_ether.h>
#include <net/if_types.h>
#include <net/slip.h>

#include <netinet/if_inarp.h>
#include <netinet/in.h>

#include <compat/sys/sockio.h>

#ifndef BPF_BUFSIZE
/*
* 4096 is too small for FDDI frames. 8192 is too small for gigabit Ethernet
* jumbos (circa 9k), ATM, or Intel gig/10gig ethernet jumbos (16k).
*/
# define BPF_BUFSIZE 32768
#endif

#define PRINET  26                      /* interruptible */

/*
* The default read buffer size, and limit for BIOCSBLEN, is sysctl'able.
* XXX the default values should be computed dynamically based
* on available memory size and available mbuf clusters.
*/
/* Read buffer size given to every new descriptor in bpfopen(). */
static int bpf_bufsize = BPF_BUFSIZE;
/* Limit for BIOCSBLEN. */
static int bpf_maxbufsize = BPF_DFLTBUFSIZE;    /* XXX set dynamically, see above */
/* NOTE(review): presumably gates use of the JIT hooks below -- confirm. */
static bool bpf_jit = false;

/*
* JIT hooks.  Both pointers start out NULL.  bpf_jit_generate() treats a
* NULL bj_generate_code as "no code generated" and returns NULL, while
* bpf_jit_freecode() asserts that bj_free_code is set before calling it.
*/
struct bpfjit_ops bpfjit_module_ops = {
       .bj_generate_code = NULL,
       .bj_free_code = NULL
};

/*
* Global BPF statistics returned by net.bpf.stats sysctl.
*/
static struct percpu    *bpf_gstats_percpu; /* struct bpf_stat */

/*
* Increment the bs_<id> member of the struct bpf_stat obtained from
* bpf_gstats_percpu.
*
* The body is wrapped in do { } while (0) so that the invocation together
* with its trailing semicolon forms exactly one statement; a bare { }
* block followed by ';' would break an unbraced if/else.
*/
#define BPF_STATINC(id)                                 \
       do {                                            \
               struct bpf_stat *__stats =              \
                   percpu_getref(bpf_gstats_percpu);   \
               __stats->bs_##id++;                     \
               percpu_putref(bpf_gstats_percpu);       \
       } while (/*CONSTCOND*/0)

/*
* Locking notes:
* - bpf_mtx (adaptive mutex) protects:
*   - Global lists: bpf_iflist and bpf_dlist
*   - struct bpf_if
*   - bpf_close
*   - bpf_psz (pserialize)
* - struct bpf_d has two mutexes:
*   - bd_buf_mtx (spin mutex) protects the buffers that can be accessed
*     on packet tapping
*   - bd_mtx (adaptive mutex) protects member variables other than the buffers
* - Locking order: bpf_mtx => bpf_d#bd_mtx => bpf_d#bd_buf_mtx
* - struct bpf_d obtained via fp->f_bpf in bpf_read and bpf_write is
*   never freed because struct bpf_d is only freed in bpf_close and
*   bpf_close is never called while bpf_read or bpf_write is executing
* - A filter that is assigned to bpf_d can be replaced with another filter
*   while tapping packets, so it needs to be done atomically
* - struct bpf_d is iterated on bpf_dlist with psz
* - struct bpf_if is iterated on bpf_iflist with psz or psref
*/
/*
* Use a mutex to avoid a race condition between gathering the stats/peers
* and opening/closing the device.
*/
static kmutex_t bpf_mtx;

/* psref class used by bpf_if_acquire()/bpf_if_release(); set in bpf_init(). */
static struct psref_class       *bpf_psref_class __read_mostly;
/*
* pserialize handle, set in bpf_init().  bpf_detachd() and bpf_close() call
* pserialize_perform() on it after unlinking a descriptor from a list.
*/
static pserialize_t             bpf_psz;

/*
* Acquire a psref on bp's bif_psref target, recorded in the caller's
* psref.  bpf_write() pairs this with bpf_if_release() on the same psref.
*/
static inline void
bpf_if_acquire(struct bpf_if *bp, struct psref *psref)
{

       psref_acquire(psref, &bp->bif_psref, bpf_psref_class);
}

/*
* Release the psref on bp's bif_psref target that was taken with
* bpf_if_acquire() using the same psref.
*/
static inline void
bpf_if_release(struct bpf_if *bp, struct psref *psref)
{

       psref_release(psref, &bp->bif_psref, bpf_psref_class);
}

/*
*  bpf_iflist is the list of interfaces; each corresponds to an ifnet
*  bpf_dlist is the list of descriptors; bpfopen() inserts each new
*  struct bpf_d and bpf_close() removes it again
*/
static struct pslist_head bpf_iflist;
static struct pslist_head bpf_dlist;

/*
* Macros for bpf_d on bpf_dlist.
* Each one wraps the PSLIST operation of the same name, applied to the
* global bpf_dlist through the bd_bpf_dlist_entry member of struct bpf_d.
*/
#define BPF_DLIST_WRITER_INSERT_HEAD(__d)                               \
       PSLIST_WRITER_INSERT_HEAD(&bpf_dlist, (__d), bd_bpf_dlist_entry)
#define BPF_DLIST_READER_FOREACH(__d)                                   \
       PSLIST_READER_FOREACH((__d), &bpf_dlist, struct bpf_d,          \
           bd_bpf_dlist_entry)
#define BPF_DLIST_WRITER_FOREACH(__d)                                   \
       PSLIST_WRITER_FOREACH((__d), &bpf_dlist, struct bpf_d,          \
           bd_bpf_dlist_entry)
#define BPF_DLIST_ENTRY_INIT(__d)                                       \
       PSLIST_ENTRY_INIT((__d), bd_bpf_dlist_entry)
#define BPF_DLIST_WRITER_REMOVE(__d)                                    \
       PSLIST_WRITER_REMOVE((__d), bd_bpf_dlist_entry)
#define BPF_DLIST_ENTRY_DESTROY(__d)                                    \
       PSLIST_ENTRY_DESTROY((__d), bd_bpf_dlist_entry)

/*
* Macros for bpf_if on bpf_iflist.
* Each one wraps the PSLIST operation of the same name, applied to the
* global bpf_iflist through the bif_iflist_entry member of struct bpf_if.
*/
#define BPF_IFLIST_WRITER_INSERT_HEAD(__bp)                             \
       PSLIST_WRITER_INSERT_HEAD(&bpf_iflist, (__bp), bif_iflist_entry)
#define BPF_IFLIST_READER_FOREACH(__bp)                                 \
       PSLIST_READER_FOREACH((__bp), &bpf_iflist, struct bpf_if,       \
           bif_iflist_entry)
#define BPF_IFLIST_WRITER_FOREACH(__bp)                                 \
       PSLIST_WRITER_FOREACH((__bp), &bpf_iflist, struct bpf_if,       \
           bif_iflist_entry)
#define BPF_IFLIST_WRITER_REMOVE(__bp)                                  \
       PSLIST_WRITER_REMOVE((__bp), bif_iflist_entry)
#define BPF_IFLIST_ENTRY_INIT(__bp)                                     \
       PSLIST_ENTRY_INIT((__bp), bif_iflist_entry)
#define BPF_IFLIST_ENTRY_DESTROY(__bp)                                  \
       PSLIST_ENTRY_DESTROY((__bp), bif_iflist_entry)

/*
* Macros for bpf_d on bpf_if#bif_dlist_head.
* These manage the per-interface list of descriptors, linked through the
* bd_bif_dlist_entry member of struct bpf_d.  The *_EMPTY forms test
* whether the list's first element is NULL.
*/
#define BPFIF_DLIST_READER_FOREACH(__d, __bp)                           \
       PSLIST_READER_FOREACH((__d), &(__bp)->bif_dlist_head, struct bpf_d, \
           bd_bif_dlist_entry)
#define BPFIF_DLIST_WRITER_INSERT_HEAD(__bp, __d)                       \
       PSLIST_WRITER_INSERT_HEAD(&(__bp)->bif_dlist_head, (__d),       \
           bd_bif_dlist_entry)
#define BPFIF_DLIST_WRITER_REMOVE(__d)                                  \
       PSLIST_WRITER_REMOVE((__d), bd_bif_dlist_entry)
#define BPFIF_DLIST_ENTRY_INIT(__d)                                     \
       PSLIST_ENTRY_INIT((__d), bd_bif_dlist_entry)
#define BPFIF_DLIST_READER_EMPTY(__bp)                                  \
       (PSLIST_READER_FIRST(&(__bp)->bif_dlist_head, struct bpf_d,     \
           bd_bif_dlist_entry) == NULL)
#define BPFIF_DLIST_WRITER_EMPTY(__bp)                                  \
       (PSLIST_WRITER_FIRST(&(__bp)->bif_dlist_head, struct bpf_d,     \
           bd_bif_dlist_entry) == NULL)
#define BPFIF_DLIST_ENTRY_DESTROY(__d)                                  \
       PSLIST_ENTRY_DESTROY((__d), bd_bif_dlist_entry)

/* Forward declarations of the file-local helpers. */
static int      bpf_allocbufs(struct bpf_d *);
static u_int    bpf_xfilter(struct bpf_filter **, void *, u_int, u_int);
static void     bpf_deliver(struct bpf_if *,
                   void *(*cpfn)(void *, const void *, size_t),
                   void *, u_int, u_int, const u_int);
static void     bpf_freed(struct bpf_d *);
static void     bpf_free_filter(struct bpf_filter *);
static void     bpf_ifname(struct ifnet *, struct ifreq *);
static void     *bpf_mcpy(void *, const void *, size_t);
static int      bpf_movein(struct ifnet *, struct uio *, int, uint64_t,
                   struct mbuf **, struct sockaddr *,
                   struct bpf_filter **);
static void     bpf_attachd(struct bpf_d *, struct bpf_if *);
static void     bpf_detachd(struct bpf_d *);
static int      bpf_setif(struct bpf_d *, struct ifreq *);
static int      bpf_setf(struct bpf_d *, struct bpf_program *, u_long);
static void     bpf_timed_out(void *);
static inline void
               bpf_wakeup(struct bpf_d *);
static int      bpf_hdrlen(struct bpf_d *);
static void     catchpacket(struct bpf_d *, u_char *, u_int, u_int,
                   void *(*)(void *, const void *, size_t),
                   struct timespec *);
static void     reset_d(struct bpf_d *);
static int      bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
static int      bpf_setdlt(struct bpf_d *, u_int);

/* File operations installed through bpf_fileops, plus the softint handler. */
static int      bpf_read(struct file *, off_t *, struct uio *, kauth_cred_t,
                   int);
static int      bpf_write(struct file *, off_t *, struct uio *, kauth_cred_t,
                   int);
static int      bpf_ioctl(struct file *, u_long, void *);
static int      bpf_poll(struct file *, int);
static int      bpf_stat(struct file *, struct stat *);
static int      bpf_close(struct file *);
static int      bpf_kqfilter(struct file *, struct knote *);
static void     bpf_softintr(void *);

/*
* File operations for a cloned bpf descriptor; bpfopen() hands this table
* to fd_clone().  fcntl and restart use the fnullop_* stubs.
*/
static const struct fileops bpf_fileops = {
       .fo_name = "bpf",
       .fo_read = bpf_read,
       .fo_write = bpf_write,
       .fo_ioctl = bpf_ioctl,
       .fo_fcntl = fnullop_fcntl,
       .fo_poll = bpf_poll,
       .fo_stat = bpf_stat,
       .fo_close = bpf_close,
       .fo_kqfilter = bpf_kqfilter,
       .fo_restart = fnullop_restart,
};

dev_type_open(bpfopen);

/*
* Character device switch.  Only d_open is a real entry point; every
* other slot is a no* stub, because bpfopen() clones a file that is
* served by bpf_fileops instead.
*/
const struct cdevsw bpf_cdevsw = {
       .d_open = bpfopen,
       .d_close = noclose,
       .d_read = noread,
       .d_write = nowrite,
       .d_ioctl = noioctl,
       .d_stop = nostop,
       .d_tty = notty,
       .d_poll = nopoll,
       .d_mmap = nommap,
       .d_kqfilter = nokqfilter,
       .d_discard = nodiscard,
       .d_flag = D_OTHER | D_MPSAFE
};

/*
* Generate JIT code for a filter program through the bj_generate_code
* hook of bpfjit_module_ops.
*
* The hook pointer is read once with acquire semantics.  Returns the
* hook's result, or NULL when no hook is installed.
*/
bpfjit_func_t
bpf_jit_generate(bpf_ctx_t *bc, void *code, size_t size)
{
       bpfjit_func_t (*gen)(const bpf_ctx_t *, const struct bpf_insn *,
           size_t);

       gen = atomic_load_acquire(&bpfjit_module_ops.bj_generate_code);
       if (gen == NULL)
               return NULL;

       return gen(bc, code, size);
}

/*
* Release JIT code through the bj_free_code hook of bpfjit_module_ops.
* The hook must be installed; this is asserted.
*/
void
bpf_jit_freecode(bpfjit_func_t jcode)
{
       struct bpfjit_ops *ops = &bpfjit_module_ops;

       KASSERT(ops->bj_free_code != NULL);
       ops->bj_free_code(jcode);
}

/*
* Copy a packet written by userland into a newly allocated mbuf chain
* and split off its link-level header.
*
*   ifp       interface the packet is written to; if_type and if_mowner
*             are consulted
*   uio       source of the packet bytes
*   linktype  DLT_* value selecting header length, alignment and the
*             address family stored in sockp
*   mtu       upper bound for the payload length (len - hlen)
*   mp        on success receives the head of the mbuf chain
*   sockp     receives the address family and, except for the loopback
*             DLT_NULL case, the raw link-level header in sa_data
*   wfilter   write filter handed to bpf_xfilter()
*
* Returns 0 on success, EIO for an unsupported linktype, EMSGSIZE when
* the payload exceeds mtu, ENOBUFS when no cluster could be attached to
* the first mbuf, EPERM when the filter returns 0, or the error from
* uiomove().  On every failure after allocation the chain is freed.
*/
static int
bpf_movein(struct ifnet *ifp, struct uio *uio, int linktype, uint64_t mtu,
   struct mbuf **mp, struct sockaddr *sockp, struct bpf_filter **wfilter)
{
       struct mbuf *m, *m0, *n;
       int error;
       size_t len;
       size_t hlen;
       size_t align;
       u_int slen;

       /*
        * Build a sockaddr based on the data link layer type.
        * We do this at this level because the ethernet header
        * is copied directly into the data field of the sockaddr.
        * In the case of SLIP, there is no header and the packet
        * is forwarded as is.
        * Also, we are careful to leave room at the front of the mbuf
        * for the link level header.
        */
       switch (linktype) {

       case DLT_SLIP:
               sockp->sa_family = AF_INET;
               hlen = 0;
               align = 0;
               break;

       case DLT_PPP:
               sockp->sa_family = AF_UNSPEC;
               hlen = 0;
               align = 0;
               break;

       case DLT_EN10MB:
               sockp->sa_family = AF_UNSPEC;
               /* XXX Would MAXLINKHDR be better? */
               /* 6(dst)+6(src)+2(type) */
               hlen = sizeof(struct ether_header);
               align = 2;
               break;

       case DLT_ARCNET:
               sockp->sa_family = AF_UNSPEC;
               hlen = ARC_HDRLEN;
               align = 5;
               break;

       case DLT_FDDI:
               sockp->sa_family = AF_LINK;
               /* XXX 4(FORMAC)+6(dst)+6(src) */
               hlen = 16;
               align = 0;
               break;

       case DLT_ECONET:
               sockp->sa_family = AF_UNSPEC;
               hlen = 6;
               align = 2;
               break;

       case DLT_NULL:
               sockp->sa_family = AF_UNSPEC;
               if (ifp->if_type == IFT_LOOP) {
                       /* Set here to apply the following validations */
                       hlen = sizeof(uint32_t);
               } else
                       hlen = 0;
               align = 0;
               break;

       default:
               return (EIO);
       }

       len = uio->uio_resid;
       /*
        * If there aren't enough bytes for a link level header or the
        * packet length exceeds the interface mtu, return an error.
        *
        * len and hlen are both size_t, so for len < hlen the subtraction
        * wraps around to a very large value; the too-short case is
        * therefore caught by the same comparison.
        */
       if (len - hlen > mtu)
               return (EMSGSIZE);

       /* The first mbuf carries the packet header; len covers payload only. */
       m0 = m = m_gethdr(M_WAIT, MT_DATA);
       m_reset_rcvif(m);
       m->m_pkthdr.len = (int)(len - hlen);
       if (len + align > MHLEN) {
               m_clget(m, M_WAIT);
               if ((m->m_flags & M_EXT) == 0) {
                       error = ENOBUFS;
                       goto bad;
               }
       }

       /* Ensure the data is properly aligned */
       if (align > 0)
               m->m_data += align;

       /*
        * Fill one mbuf after another until the uio is drained.  From
        * here on len is the number of bytes in the current mbuf.
        */
       for (;;) {
               len = M_TRAILINGSPACE(m);
               if (len > uio->uio_resid)
                       len = uio->uio_resid;
               error = uiomove(mtod(m, void *), len, uio);
               if (error)
                       goto bad;
               m->m_len = len;

               if (uio->uio_resid == 0)
                       break;

               n = m_get(M_WAIT, MT_DATA);
               m_clget(n, M_WAIT);     /* if fails, there is no problem */
               m->m_next = n;
               m = n;
       }

       /*
        * m and len now describe the last mbuf filled by the loop above.
        * NOTE(review): for a packet spanning several mbufs the write
        * filter therefore sees only the final segment -- confirm that
        * this is intended.
        */
       slen = bpf_xfilter(wfilter, mtod(m, u_char *), len, len);
       if (slen == 0) {
               error = EPERM;
               goto bad;
       }

       if (hlen != 0) {
               if (linktype == DLT_NULL && ifp->if_type == IFT_LOOP) {
                       uint32_t af;
                       /* the link header indicates the address family */
                       memcpy(&af, mtod(m0, void *), sizeof(af));
                       sockp->sa_family = af;
               } else {
                       /* move link level header in the top of mbuf to sa_data */
                       memcpy(sockp->sa_data, mtod(m0, void *), hlen);
               }
               /* Strip the link level header from the first mbuf. */
               m0->m_data += hlen;
               m0->m_len -= hlen;
       }

       m_claimm(m, ifp->if_mowner);

       *mp = m0;
       return (0);

bad:
       m_freem(m0);
       return (error);
}

/*
* Attach file to the bpf interface, i.e. make d listen on bp.
*/
static void
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
{
       struct bpf_event_tracker *t;

       KASSERT(mutex_owned(&bpf_mtx));
       KASSERT(mutex_owned(d->bd_mtx));
       /*
        * Point d at bp, and add d to the interface's list of listeners.
        * Finally, point the driver's bpf cookie at the interface so
        * it will divert packets to bpf.
        */
       d->bd_bif = bp;
       BPFIF_DLIST_WRITER_INSERT_HEAD(bp, d);

       *bp->bif_driverp = bp;

       SLIST_FOREACH(t, &bp->bif_trackers, bet_entries) {
               t->bet_notify(bp, bp->bif_ifp, bp->bif_dlt,
                   BPF_TRACK_EVENT_ATTACH);
       }
}

/*
* Detach a file from its interface.
*/
static void
bpf_detachd(struct bpf_d *d)
{
       struct bpf_if *bp;
       struct bpf_event_tracker *t;

       KASSERT(mutex_owned(&bpf_mtx));
       KASSERT(mutex_owned(d->bd_mtx));

       bp = d->bd_bif;
       /*
        * Check if this descriptor had requested promiscuous mode.
        * If so, turn it off.
        */
       if (d->bd_promisc) {
               int error __diagused;

               d->bd_promisc = 0;
               /*
                * Take device out of promiscuous mode.  Since we were
                * able to enter promiscuous mode, we should be able
                * to turn it off.  But we can get an error if
                * the interface was configured down, so only panic
                * if we don't get an unexpected error.
                */
               KERNEL_LOCK_UNLESS_NET_MPSAFE();
               error = ifpromisc(bp->bif_ifp, 0);
               KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
#ifdef DIAGNOSTIC
               if (error)
                       printf("%s: ifpromisc failed: %d", __func__, error);
#endif
       }

       /* Remove d from the interface's descriptor list. */
       BPFIF_DLIST_WRITER_REMOVE(d);

       pserialize_perform(bpf_psz);

       if (BPFIF_DLIST_WRITER_EMPTY(bp)) {
               /*
                * Let the driver know that there are no more listeners.
                */
               *d->bd_bif->bif_driverp = NULL;
       }

       d->bd_bif = NULL;

       SLIST_FOREACH(t, &bp->bif_trackers, bet_entries) {
               t->bet_notify(bp, bp->bif_ifp, bp->bif_dlt,
                   BPF_TRACK_EVENT_DETACH);
       }
}

/*
* Set up the global bpf state: the bpf_mtx lock, the pserialize handle,
* the psref class, the two global lists and the per-CPU statistics.
*/
static void
bpf_init(void)
{
       /* Global lock, pserialize handle and psref class. */
       mutex_init(&bpf_mtx, MUTEX_DEFAULT, IPL_NONE);
       bpf_psz = pserialize_create();
       bpf_psref_class = psref_class_create("bpf", IPL_SOFTNET);

       /* Interface list and descriptor list. */
       PSLIST_INIT(&bpf_iflist);
       PSLIST_INIT(&bpf_dlist);

       /* One struct bpf_stat per CPU, used by BPF_STATINC(). */
       bpf_gstats_percpu = percpu_alloc(sizeof(struct bpf_stat));
}

/*
* bpfilterattach() is called at boot time.  We don't need to do anything
* here, since any initialization will happen as part of module init code.
*
* The argument n is not used.
*/
/* ARGSUSED */
void
bpfilterattach(int n)
{

}

/*
* Open the bpf device.  Clones: every open allocates its own struct bpf_d
* and binds it, together with bpf_fileops, to a new file via fd_clone().
*
* dev and mode are not used.  Returns the error from fd_allocfile() if
* no file could be allocated, otherwise the result of fd_clone().
*/
/* ARGSUSED */
int
bpfopen(dev_t dev, int flag, int mode, struct lwp *l)
{
       struct bpf_d *d;
       struct file *fp;
       int error, fd;

       /* fd_allocfile() hands back the new file and its descriptor number. */
       if ((error = fd_allocfile(&fp, &fd)) != 0)
               return error;

       d = kmem_zalloc(sizeof(*d), KM_SLEEP);
       d->bd_bufsize = bpf_bufsize;
       d->bd_direction = BPF_D_INOUT;
       d->bd_feedback = 0;
       d->bd_pid = l->l_proc->p_pid;
#ifdef _LP64
       if (curproc->p_flag & PK_32)
               d->bd_compat32 = 1;
#endif
       /* Birth, access and modification times all start at "now". */
       getnanotime(&d->bd_btime);
       d->bd_atime = d->bd_mtime = d->bd_btime;
       callout_init(&d->bd_callout, CALLOUT_MPSAFE);
       selinit(&d->bd_sel);
       /* bpf_softintr() delivers SIGIO on behalf of bpf_wakeup(). */
       d->bd_sih = softint_establish(SOFTINT_CLOCK, bpf_softintr, d);
       d->bd_jitcode = NULL;
       d->bd_rfilter = NULL;
       d->bd_wfilter = NULL;
       d->bd_locked = 0;
       BPF_DLIST_ENTRY_INIT(d);
       BPFIF_DLIST_ENTRY_INIT(d);
       d->bd_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SOFTNET);
       d->bd_buf_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET);
       cv_init(&d->bd_cv, "bpf");

       /* Publish the fully initialised descriptor on the global list. */
       mutex_enter(&bpf_mtx);
       BPF_DLIST_WRITER_INSERT_HEAD(d);
       mutex_exit(&bpf_mtx);

       return fd_clone(fp, fd, flag, &bpf_fileops, d);
}

/*
* Close the descriptor by detaching it from its interface,
* deallocating its buffers, and freeing it.
*
* Returns 0 in all cases, including when fp carries no descriptor.
*/
/* ARGSUSED */
static int
bpf_close(struct file *fp)
{
       struct bpf_d *d;

       mutex_enter(&bpf_mtx);

       if ((d = fp->f_bpf) == NULL) {
               mutex_exit(&bpf_mtx);
               return 0;
       }

       /*
        * Refresh the PID associated with this bpf file.
        */
       d->bd_pid = curproc->p_pid;

       /* Halt the callout if the descriptor is waiting, then go idle. */
       mutex_enter(d->bd_buf_mtx);
       if (d->bd_state == BPF_WAITING)
               callout_halt(&d->bd_callout, d->bd_buf_mtx);
       d->bd_state = BPF_IDLE;
       mutex_exit(d->bd_buf_mtx);
       /* bpf_detachd() requires both bpf_mtx and bd_mtx to be held. */
       mutex_enter(d->bd_mtx);
       if (d->bd_bif)
               bpf_detachd(d);
       mutex_exit(d->bd_mtx);

       /* Unlink from the global list, still under bpf_mtx. */
       BPF_DLIST_WRITER_REMOVE(d);

       pserialize_perform(bpf_psz);
       mutex_exit(&bpf_mtx);

       /* d is off both lists now; tear down everything bpfopen() set up. */
       BPFIF_DLIST_ENTRY_DESTROY(d);
       BPF_DLIST_ENTRY_DESTROY(d);
       fp->f_bpf = NULL;
       bpf_freed(d);
       callout_destroy(&d->bd_callout);
       seldestroy(&d->bd_sel);
       softint_disestablish(d->bd_sih);
       mutex_obj_free(d->bd_mtx);
       mutex_obj_free(d->bd_buf_mtx);
       cv_destroy(&d->bd_cv);

       kmem_free(d, sizeof(*d));

       return (0);
}

/*
* Rotate the packet buffers in descriptor d.  Move the store buffer
* into the hold slot, and the free buffer into the store slot.
* Zero the length of the new store buffer.
*
* d is evaluated several times, so it must not have side effects.
* The statements are wrapped in do { } while (0) so that the macro plus
* its trailing semicolon is a single statement, which keeps it safe as
* the body of an unbraced if/else.
*/
#define ROTATE_BUFFERS(d) \
       do { \
               (d)->bd_hbuf = (d)->bd_sbuf; \
               (d)->bd_hlen = (d)->bd_slen; \
               (d)->bd_sbuf = (d)->bd_fbuf; \
               (d)->bd_slen = 0; \
               (d)->bd_fbuf = NULL; \
       } while (/*CONSTCOND*/0)
/*
*  bpfread - read next chunk of packets from buffers
*
*  The request size (uio->uio_resid) must equal d->bd_bufsize, otherwise
*  EINVAL is returned.  Unless FNONBLOCK is set on fp, the call sleeps on
*  bd_cv for up to bd_rtout until a hold buffer is available.
*
*  Returns 0 on success (also when a timeout expires with nothing
*  buffered), EWOULDBLOCK for a non-blocking read with nothing buffered,
*  EINTR/ERESTART when the sleep is interrupted, or the error from
*  uiomove().  offp, cred and flags are not used.
*/
static int
bpf_read(struct file *fp, off_t *offp, struct uio *uio,
   kauth_cred_t cred, int flags)
{
       struct bpf_d *d = fp->f_bpf;
       int timed_out;
       int error;

       /*
        * Refresh the PID associated with this bpf file.
        */
       d->bd_pid = curproc->p_pid;

       getnanotime(&d->bd_atime);
       /*
        * Restrict application to use a buffer the same size as
        * the kernel buffers.
        */
       if (uio->uio_resid != d->bd_bufsize)
               return (EINVAL);

       /*
        * Halt the callout if the descriptor is waiting, remember whether
        * it had already timed out, and return the state to idle.
        */
       mutex_enter(d->bd_buf_mtx);
       if (d->bd_state == BPF_WAITING)
               callout_halt(&d->bd_callout, d->bd_buf_mtx);
       timed_out = (d->bd_state == BPF_TIMED_OUT);
       d->bd_state = BPF_IDLE;
       mutex_exit(d->bd_buf_mtx);
       /*
        * If the hold buffer is empty, then do a timed sleep, which
        * ends when the timeout expires or when enough packets
        * have arrived to fill the store buffer.
        */
       mutex_enter(d->bd_buf_mtx);
       while (d->bd_hbuf == NULL) {
               if (fp->f_flag & FNONBLOCK) {
                       if (d->bd_slen == 0) {
                               error = EWOULDBLOCK;
                               goto out;
                       }
                       ROTATE_BUFFERS(d);
                       break;
               }

               if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
                       /*
                        * A packet(s) either arrived since the previous
                        * read or arrived while we were asleep.
                        * Rotate the buffers and return what's here.
                        */
                       ROTATE_BUFFERS(d);
                       break;
               }

               error = cv_timedwait_sig(&d->bd_cv, d->bd_buf_mtx, d->bd_rtout);

               if (error == EINTR || error == ERESTART)
                       goto out;

               if (error == EWOULDBLOCK) {
                       /*
                        * On a timeout, return what's in the buffer,
                        * which may be nothing.  If there is something
                        * in the store buffer, we can rotate the buffers.
                        */
                       if (d->bd_hbuf)
                               /*
                                * We filled up the buffer in between
                                * getting the timeout and arriving
                                * here, so we don't need to rotate.
                                */
                               break;

                       if (d->bd_slen == 0) {
                               error = 0;
                               goto out;
                       }
                       ROTATE_BUFFERS(d);
                       break;
               }
               if (error != 0)
                       goto out;
       }
       /*
        * At this point, we know we have something in the hold slot.
        * The buffer lock is dropped around the copy to userland.
        */
       mutex_exit(d->bd_buf_mtx);

       /*
        * Move data from hold buffer into user space.
        * We know the entire buffer is transferred since
        * we checked above that the read buffer is bpf_bufsize bytes.
        */
       error = uiomove(d->bd_hbuf, d->bd_hlen, uio);

       /* The hold buffer becomes the free buffer, whatever uiomove() said. */
       mutex_enter(d->bd_buf_mtx);
       d->bd_fbuf = d->bd_hbuf;
       d->bd_hbuf = NULL;
       d->bd_hlen = 0;
out:
       mutex_exit(d->bd_buf_mtx);
       return (error);
}

/*
* Wake up everything that may be waiting on descriptor d.
* The caller must hold d->bd_buf_mtx.
*/
static inline void
bpf_wakeup(struct bpf_d *d)
{
       KASSERT(mutex_owned(d->bd_buf_mtx));

       /* Sleepers in bpf_read() wait on bd_cv. */
       cv_broadcast(&d->bd_cv);

       /* With async mode on, SIGIO is raised from bpf_softintr(). */
       if (d->bd_async) {
               softint_schedule(d->bd_sih);
       }

       selnotify(&d->bd_sel, 0, NOTE_SUBMIT);
}

/*
* Soft interrupt handler established in bpfopen(); cookie is the
* descriptor.  Sends SIGIO to bd_pgid if async mode is still enabled.
*/
static void
bpf_softintr(void *cookie)
{
       struct bpf_d *d = cookie;

       if (!d->bd_async)
               return;

       fownsignal(d->bd_pgid, SIGIO, 0, 0, NULL);
}

/*
* Timeout handler; arg is the descriptor.
* NOTE(review): presumably run from bd_callout -- the site that arms it
* is not visible here.
*
* A descriptor that is still BPF_WAITING becomes BPF_TIMED_OUT, and any
* waiters are woken if the store buffer holds data.
*/
static void
bpf_timed_out(void *arg)
{
       struct bpf_d *d = arg;

       mutex_enter(d->bd_buf_mtx);
       if (d->bd_state != BPF_WAITING)
               goto done;

       d->bd_state = BPF_TIMED_OUT;
       if (d->bd_slen != 0)
               bpf_wakeup(d);
done:
       mutex_exit(d->bd_buf_mtx);
}

/*
* Write one packet from userland out through the interface the
* descriptor is attached to.
*
* Returns ENXIO when the descriptor has no interface or the interface is
* deactivated, 0 for an empty write, the error from bpf_movein(),
* EMSGSIZE when the payload exceeds the interface MTU, or otherwise the
* result of if_output_lock().  offp, cred and flags are not used.
*
* With bd_feedback set, a copy of the packet is also fed to the
* interface's input routine after a successful transmit.
*/
static int
bpf_write(struct file *fp, off_t *offp, struct uio *uio,
   kauth_cred_t cred, int flags)
{
       struct bpf_d *d = fp->f_bpf;
       struct bpf_if *bp;
       struct ifnet *ifp;
       struct mbuf *m, *mc;
       int error;
       struct sockaddr_storage dst;
       struct psref psref;
       int bound;

       /*
        * Refresh the PID associated with this bpf file.
        */
       d->bd_pid = curproc->p_pid;

       m = NULL;       /* XXX gcc */

       /*
        * The destination address lives on the stack: a function-static
        * buffer would be shared by all concurrent writers (the device
        * is D_MPSAFE) and would carry header bytes over from one write
        * to the next.  bpf_movein() fills in only part of it, so start
        * from all zeroes.
        */
       memset(&dst, 0, sizeof(dst));

       bound = curlwp_bind();
       mutex_enter(d->bd_mtx);
       bp = d->bd_bif;
       if (bp == NULL) {
               mutex_exit(d->bd_mtx);
               error = ENXIO;
               goto out_bindx;
       }
       /* Hold a reference on bp so it can be used after bd_mtx is dropped. */
       bpf_if_acquire(bp, &psref);
       mutex_exit(d->bd_mtx);

       getnanotime(&d->bd_mtime);

       ifp = bp->bif_ifp;
       if (if_is_deactivated(ifp)) {
               error = ENXIO;
               goto out;
       }

       if (uio->uio_resid == 0) {
               error = 0;
               goto out;
       }

       error = bpf_movein(ifp, uio, (int)bp->bif_dlt, ifp->if_mtu, &m,
           (struct sockaddr *) &dst, &d->bd_wfilter);
       if (error)
               goto out;

       if (m->m_pkthdr.len > ifp->if_mtu) {
               m_freem(m);
               error = EMSGSIZE;
               goto out;
       }

       /*
        * If writing to a loopback interface, the address family has
        * already been specially computed in bpf_movein(), so don't
        * clobber it, or the loopback will reject it in looutput().
        */
       if (d->bd_hdrcmplt && ifp->if_type != IFT_LOOP)
               dst.ss_family = pseudo_AF_HDRCMPLT;

       if (d->bd_feedback) {
               /* A failed copy merely disables feedback for this packet. */
               mc = m_dup(m, 0, M_COPYALL, M_NOWAIT);
               if (mc != NULL)
                       m_set_rcvif(mc, ifp);
               /* Set M_PROMISC for outgoing packets to be discarded. */
               if (1 /*d->bd_direction == BPF_D_INOUT*/)
                       m->m_flags |= M_PROMISC;
       } else
               mc = NULL;

       error = if_output_lock(ifp, ifp, m, (struct sockaddr *) &dst, NULL);

       if (mc != NULL) {
               if (error == 0) {
                       int s = splsoftnet();
                       KERNEL_LOCK_UNLESS_IFP_MPSAFE(ifp);
                       ifp->_if_input(ifp, mc);
                       KERNEL_UNLOCK_UNLESS_IFP_MPSAFE(ifp);
                       splx(s);
               } else
                       m_freem(mc);
       }
       /*
        * The driver frees the mbuf.
        */
out:
       bpf_if_release(bp, &psref);
out_bindx:
       curlwp_bindx(bound);
       return error;
}

/*
* Flush the packet buffers of descriptor d and zero its packet counters.
* The caller holds d->bd_mtx; d->bd_buf_mtx is taken internally.
*/
static void
reset_d(struct bpf_d *d)
{
       KASSERT(mutex_owned(d->bd_mtx));

       mutex_enter(d->bd_buf_mtx);

       /* A pending hold buffer is handed back as the free buffer. */
       if (d->bd_hbuf != NULL) {
               d->bd_fbuf = d->bd_hbuf;
               d->bd_hbuf = NULL;
       }

       /* Nothing is buffered any more. */
       d->bd_hlen = 0;
       d->bd_slen = 0;

       /* Restart the three packet counters. */
       d->bd_ccount = 0;
       d->bd_dcount = 0;
       d->bd_rcount = 0;

       mutex_exit(d->bd_buf_mtx);
}

/*
* Handle an ioctl request on a bpf descriptor.
*
* fp    file whose f_bpf member is the descriptor being operated on
* cmd   request code, see the list below
* addr  request argument; each case casts it to the type it expects
*
* Returns 0 on success, EPERM when the descriptor has been locked with
* BIOCLOCK and cmd is not on the permitted list, EINVAL for an unknown
* request or a rejected argument, or whatever error the helper that
* carries out the request reports.
*
*  FIONREAD            Check for read packet available.
*  BIOCGBLEN           Get buffer len [for read()].
*  BIOCSBLEN           Set buffer len (only before buffers are allocated).
*  BIOCSETF            Set ethernet read filter.
*  BIOCSETWF           Set write filter.
*  BIOCLOCK            Lock the descriptor against further changes.
*  BIOCFLUSH           Flush read packet buffer.
*  BIOCPROMISC         Put interface into promiscuous mode.
*  BIOCGDLT            Get link layer type.
*  BIOCGDLTLIST        Get list of supported link layer types.
*  BIOCSDLT            Set link layer type.
*  BIOCGETIF           Get interface name.
*  BIOCSETIF           Set interface.
*  BIOCSRTIMEOUT       Set read timeout.
*  BIOCGRTIMEOUT       Get read timeout.
*  BIOCGSTATS          Get packet stats.
*  BIOCIMMEDIATE       Set immediate mode.
*  BIOCVERSION         Get filter language version.
*  BIOCGHDRCMPLT       Get "header already complete" flag.
*  BIOCSHDRCMPLT       Set "header already complete" flag.
*  BIOCSFEEDBACK       Set packet feedback mode.
*  BIOCGFEEDBACK       Get packet feedback mode.
*  BIOCGDIRECTION      Get packet direction flag
*  BIOCSDIRECTION      Set packet direction flag
*/
/* ARGSUSED */
static int
bpf_ioctl(struct file *fp, u_long cmd, void *addr)
{
       struct bpf_d *d = fp->f_bpf;
       int error = 0;

       /*
        * Refresh the PID associated with this bpf file.
        */
       d->bd_pid = curproc->p_pid;
#ifdef _LP64
       /* Remember whether the calling process carries the PK_32 flag. */
       if (curproc->p_flag & PK_32)
               d->bd_compat32 = 1;
       else
               d->bd_compat32 = 0;
#endif

       /*
        * Stop a read timeout that is still pending (bpf_poll() arms it
        * and sets BPF_WAITING) and put the descriptor back to idle.
        */
       mutex_enter(d->bd_buf_mtx);
       if (d->bd_state == BPF_WAITING)
               callout_halt(&d->bd_callout, d->bd_buf_mtx);
       d->bd_state = BPF_IDLE;
       mutex_exit(d->bd_buf_mtx);

       /*
        * Once BIOCLOCK has been issued only the requests listed here
        * are still accepted; everything else fails with EPERM.
        */
       if (d->bd_locked) {
               switch (cmd) {
               case BIOCGBLEN:         /* FALLTHROUGH */
               case BIOCFLUSH:         /* FALLTHROUGH */
               case BIOCGDLT:          /* FALLTHROUGH */
               case BIOCGDLTLIST:      /* FALLTHROUGH */
               case BIOCGETIF:         /* FALLTHROUGH */
               case BIOCGRTIMEOUT:     /* FALLTHROUGH */
               case BIOCGSTATS:        /* FALLTHROUGH */
               case BIOCVERSION:       /* FALLTHROUGH */
               case BIOCGHDRCMPLT:     /* FALLTHROUGH */
               case FIONREAD:          /* FALLTHROUGH */
               case BIOCLOCK:          /* FALLTHROUGH */
               case BIOCSRTIMEOUT:     /* FALLTHROUGH */
               case BIOCIMMEDIATE:     /* FALLTHROUGH */
               case TIOCGPGRP:
                       break;
               default:
                       return EPERM;
               }
       }

       switch (cmd) {

       /* Unknown request. */
       default:
               error = EINVAL;
               break;

       /*
        * Check for read packet available.
        */
       case FIONREAD: {
               int n;

               /* Store buffer bytes plus, if present, hold buffer bytes. */
               mutex_enter(d->bd_buf_mtx);
               n = d->bd_slen;
               if (d->bd_hbuf)
                       n += d->bd_hlen;
               mutex_exit(d->bd_buf_mtx);

               *(int *)addr = n;
               break;
       }

       /*
        * Get buffer len [for read()].
        */
       case BIOCGBLEN:
               *(u_int *)addr = d->bd_bufsize;
               break;

       /*
        * Set buffer length.
        */
       case BIOCSBLEN:
               /*
                * Forbid to change the buffer length if buffers are already
                * allocated.
                */
               mutex_enter(d->bd_mtx);
               mutex_enter(d->bd_buf_mtx);
               if (d->bd_bif != NULL || d->bd_sbuf != NULL)
                       error = EINVAL;
               else {
                       u_int size = *(u_int *)addr;

                       /*
                        * Clamp into [BPF_MINBUFSIZE, bpf_maxbufsize] and
                        * report the value actually used back to the caller.
                        */
                       if (size > bpf_maxbufsize)
                               *(u_int *)addr = size = bpf_maxbufsize;
                       else if (size < BPF_MINBUFSIZE)
                               *(u_int *)addr = size = BPF_MINBUFSIZE;
                       d->bd_bufsize = size;
               }
               mutex_exit(d->bd_buf_mtx);
               mutex_exit(d->bd_mtx);
               break;

       /*
        * Set link layer read filter (BIOCSETF) or write filter (BIOCSETWF).
        */
       case BIOCSETF:          /* FALLTHROUGH */
       case BIOCSETWF:
               error = bpf_setf(d, addr, cmd);
               break;

       /*
        * Lock the descriptor; see the bd_locked check above.
        */
       case BIOCLOCK:
               d->bd_locked = 1;
               break;

       /*
        * Flush read packet buffer.
        */
       case BIOCFLUSH:
               mutex_enter(d->bd_mtx);
               reset_d(d);
               mutex_exit(d->bd_mtx);
               break;

       /*
        * Put interface into promiscuous mode.
        */
       case BIOCPROMISC:
               mutex_enter(d->bd_mtx);
               if (d->bd_bif == NULL) {
                       mutex_exit(d->bd_mtx);
                       /*
                        * No interface attached yet.
                        */
                       error = EINVAL;
                       break;
               }
               /* Only ask the interface once per descriptor. */
               if (d->bd_promisc == 0) {
                       KERNEL_LOCK_UNLESS_NET_MPSAFE();
                       error = ifpromisc(d->bd_bif->bif_ifp, 1);
                       KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
                       if (error == 0)
                               d->bd_promisc = 1;
               }
               mutex_exit(d->bd_mtx);
               break;

       /*
        * Get device parameters.
        */
       case BIOCGDLT:
               mutex_enter(d->bd_mtx);
               if (d->bd_bif == NULL)
                       error = EINVAL;
               else
                       *(u_int *)addr = d->bd_bif->bif_dlt;
               mutex_exit(d->bd_mtx);
               break;

       /*
        * Get a list of supported device parameters.
        */
       case BIOCGDLTLIST:
               mutex_enter(d->bd_mtx);
               if (d->bd_bif == NULL)
                       error = EINVAL;
               else
                       error = bpf_getdltlist(d, addr);
               mutex_exit(d->bd_mtx);
               break;

       /*
        * Set device parameters.
        */
       case BIOCSDLT:
               /* bpf_mtx is taken before the per-descriptor mutex. */
               mutex_enter(&bpf_mtx);
               mutex_enter(d->bd_mtx);
               if (d->bd_bif == NULL)
                       error = EINVAL;
               else
                       error = bpf_setdlt(d, *(u_int *)addr);
               mutex_exit(d->bd_mtx);
               mutex_exit(&bpf_mtx);
               break;

       /*
        * Get interface name.
        */
#ifdef OBIOCGETIF
       case OBIOCGETIF:
#endif
       case BIOCGETIF:
               mutex_enter(d->bd_mtx);
               if (d->bd_bif == NULL)
                       error = EINVAL;
               else
                       bpf_ifname(d->bd_bif->bif_ifp, addr);
               mutex_exit(d->bd_mtx);
               break;

       /*
        * Set interface.
        */
#ifdef OBIOCSETIF
       case OBIOCSETIF:
#endif
       case BIOCSETIF:
               /* bpf_setif() asserts that bpf_mtx is held. */
               mutex_enter(&bpf_mtx);
               error = bpf_setif(d, addr);
               mutex_exit(&bpf_mtx);
               break;

       /*
        * Set read timeout.
        */
       case BIOCSRTIMEOUT: {
               struct timeval *tv = addr;

               /*
                * Compute number of ticks.  Negative or out-of-range
                * fields are rejected, and a seconds value too large to
                * convert is clamped to INT_MAX ticks.
                */
               if (tv->tv_sec < 0 ||
                   tv->tv_usec < 0 || tv->tv_usec >= 1000000) {
                       error = EINVAL;
                       break;
               } else if (tv->tv_sec > INT_MAX/hz - 1) {
                       d->bd_rtout = INT_MAX;
               } else {
                       d->bd_rtout = tv->tv_sec * hz + tv->tv_usec / tick;
               }
               /* A nonzero timeout shorter than one tick becomes one tick. */
               if ((d->bd_rtout == 0) && (tv->tv_usec != 0))
                       d->bd_rtout = 1;
               break;
       }

#ifdef BIOCGORTIMEOUT
       /*
        * Get read timeout (struct timeval50 variant).
        */
       case BIOCGORTIMEOUT: {
               struct timeval50 *tv = addr;

               tv->tv_sec = d->bd_rtout / hz;
               tv->tv_usec = (d->bd_rtout % hz) * tick;
               break;
       }
#endif

#ifdef BIOCSORTIMEOUT
       /*
        * Set read timeout (struct timeval50 variant); same rules as
        * BIOCSRTIMEOUT.
        */
       case BIOCSORTIMEOUT: {
               struct timeval50 *tv = addr;

               /* Compute number of ticks. */
               if (tv->tv_sec < 0 ||
                   tv->tv_usec < 0 || tv->tv_usec >= 1000000) {
                       error = EINVAL;
                       break;
               } else if (tv->tv_sec > INT_MAX/hz - 1) {
                       d->bd_rtout = INT_MAX;
               } else {
                       d->bd_rtout = tv->tv_sec * hz + tv->tv_usec / tick;
               }
               if ((d->bd_rtout == 0) && (tv->tv_usec != 0))
                       d->bd_rtout = 1;
               break;
       }
#endif

       /*
        * Get read timeout.
        */
       case BIOCGRTIMEOUT: {
               struct timeval *tv = addr;

               /* Convert the stored tick count back to sec/usec. */
               tv->tv_sec = d->bd_rtout / hz;
               tv->tv_usec = (d->bd_rtout % hz) * tick;
               break;
       }
       /*
        * Get packet stats.  The counters are read without taking a lock.
        */
       case BIOCGSTATS: {
               struct bpf_stat *bs = addr;

               bs->bs_recv = d->bd_rcount;
               bs->bs_drop = d->bd_dcount;
               bs->bs_capt = d->bd_ccount;
               break;
       }

       /*
        * Get packet stats into a struct bpf_stat30; only the receive and
        * drop counts are filled in.
        */
       case BIOCGSTATS_30: {
               struct bpf_stat30 *bs = addr;

               bs->bs_recv = d->bd_rcount;
               bs->bs_drop = d->bd_dcount;
               break;
       }

       /*
        * Set immediate mode.
        */
       case BIOCIMMEDIATE:
               d->bd_immediate = *(u_int *)addr;
               break;

       /*
        * Get filter language version.
        */
       case BIOCVERSION: {
               struct bpf_version *bv = addr;

               bv->bv_major = BPF_MAJOR_VERSION;
               bv->bv_minor = BPF_MINOR_VERSION;
               break;
       }

       case BIOCGHDRCMPLT:     /* get "header already complete" flag */
               *(u_int *)addr = d->bd_hdrcmplt;
               break;

       case BIOCSHDRCMPLT:     /* set "header already complete" flag */
               /* Any nonzero argument is normalized to 1. */
               d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
               break;

       /*
        * Get packet direction flag
        */
       case BIOCGDIRECTION:
               *(u_int *)addr = d->bd_direction;
               break;

       /*
        * Set packet direction flag
        */
       case BIOCSDIRECTION: {
               u_int   direction;

               /* Only the three known direction values are accepted. */
               direction = *(u_int *)addr;
               switch (direction) {
               case BPF_D_IN:
               case BPF_D_INOUT:
               case BPF_D_OUT:
                       d->bd_direction = direction;
                       break;
               default:
                       error = EINVAL;
               }
       }
       break;

       /*
        * Set "feed packets from bpf back to input" mode
        */
       case BIOCSFEEDBACK:
               d->bd_feedback = *(u_int *)addr;
               break;

       /*
        * Get "feed packets from bpf back to input" mode
        */
       case BIOCGFEEDBACK:
               *(u_int *)addr = d->bd_feedback;
               break;

       case FIONBIO:           /* Non-blocking I/O */
               /*
                * No need to do anything special as we use IO_NDELAY in
                * bpfread() as an indication of whether or not to block
                * the read.
                */
               break;

       case FIOASYNC:          /* Send signal on receive packets */
               mutex_enter(d->bd_mtx);
               d->bd_async = *(int *)addr;
               mutex_exit(d->bd_mtx);
               break;

       case TIOCSPGRP:         /* Process or group to send signals to */
       case FIOSETOWN:
               error = fsetown(&d->bd_pgid, cmd, addr);
               break;

       case TIOCGPGRP:         /* Report the current signal recipient */
       case FIOGETOWN:
               error = fgetown(d->bd_pgid, cmd, addr);
               break;
       }
       return (error);
}

/*
* Install the filter program described by fp on descriptor d, replacing
* and freeing whatever filter was installed before.
*
* cmd selects the target: BIOCSETWF replaces the write filter, anything
* else the read filter.  A program of length zero installs a filter with
* no instructions.
*
* Returns 0 on success and EINVAL if the program is too long, has a
* length but no instruction pointer, cannot be copied in, or does not
* pass bpf_validate().
*/
static int
bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
{
       struct bpf_filter *prev, *repl, **slot;
       struct bpf_insn *insns = NULL;
       bpfjit_func_t jit = NULL;
       size_t ninsns = fp->bf_len;
       size_t nbytes = 0;

       if (ninsns > BPF_MAXINSNS)
               return EINVAL;
       if (ninsns != 0 && fp->bf_insns == NULL)
               return EINVAL;

       if (ninsns != 0) {
               bool usable;

               /*
                * Pull the byte-code in from userspace and make sure it
                * validates before going any further.
                */
               nbytes = ninsns * sizeof(*fp->bf_insns);
               insns = kmem_alloc(nbytes, KM_SLEEP);
               usable = copyin(fp->bf_insns, insns, nbytes) == 0 &&
                   bpf_validate(insns, (int)ninsns);
               if (!usable) {
                       kmem_free(insns, nbytes);
                       return EINVAL;
               }
               if (bpf_jit)
                       jit = bpf_jit_generate(NULL, insns, ninsns);
       }

       repl = kmem_alloc(sizeof(*repl), KM_SLEEP);
       repl->bf_jitcode = jit;
       repl->bf_size = nbytes;
       repl->bf_insn = insns;
       if (cmd == BIOCSETF)
               d->bd_jitcode = jit; /* XXX just for kvm(3) users */

       /* bpf_mtx must be held across pserialize_perform(). */
       mutex_enter(&bpf_mtx);
       mutex_enter(d->bd_mtx);

       slot = (cmd == BIOCSETWF) ? &d->bd_wfilter : &d->bd_rfilter;
       prev = *slot;

       /* Publish the new filter, then wait out readers of the old one. */
       atomic_store_release(slot, repl);
       reset_d(d);
       pserialize_perform(bpf_psz);

       mutex_exit(d->bd_mtx);
       mutex_exit(&bpf_mtx);

       if (prev != NULL)
               bpf_free_filter(prev);

       return 0;
}

/*
* Detach a file from its current interface (if attached at all) and attach
* to the interface indicated by the name stored in ifr.
* Return an errno or 0.
*/
static int
bpf_setif(struct bpf_d *d, struct ifreq *ifr)
{
       struct bpf_if *bp;
       char *cp;
       int unit_seen, i, error;

       KASSERT(mutex_owned(&bpf_mtx));
       /*
        * Make sure the provided name has a unit number, and default
        * it to '0' if not specified.
        * XXX This is ugly ... do this differently?
        */
       unit_seen = 0;
       cp = ifr->ifr_name;
       cp[sizeof(ifr->ifr_name) - 1] = '\0';   /* sanity */
       while (*cp++)
               if (*cp >= '0' && *cp <= '9')
                       unit_seen = 1;
       if (!unit_seen) {
               /* Make sure to leave room for the '\0'. */
               for (i = 0; i < (IFNAMSIZ - 1); ++i) {
                       if ((ifr->ifr_name[i] >= 'a' &&
                               ifr->ifr_name[i] <= 'z') ||
                           (ifr->ifr_name[i] >= 'A' &&
                               ifr->ifr_name[i] <= 'Z'))
                               continue;
                       ifr->ifr_name[i] = '0';
               }
       }

       /*
        * Look through attached interfaces for the named one.
        */
       BPF_IFLIST_WRITER_FOREACH(bp) {
               struct ifnet *ifp = bp->bif_ifp;

               if (ifp == NULL ||
                   strcmp(ifp->if_xname, ifr->ifr_name) != 0)
                       continue;
               /* skip additional entry */
               if (bp->bif_driverp != &ifp->if_bpf)
                       continue;
               /*
                * We found the requested interface.
                * Allocate the packet buffers if we need to.
                * If we're already attached to requested interface,
                * just flush the buffer.
                */
               /*
                * bpf_allocbufs is called only here. bpf_mtx ensures that
                * no race condition happen on d->bd_sbuf.
                */
               if (d->bd_sbuf == NULL) {
                       error = bpf_allocbufs(d);
                       if (error != 0)
                               return (error);
               }
               mutex_enter(d->bd_mtx);
               if (bp != d->bd_bif) {
                       if (d->bd_bif) {
                               /*
                                * Detach if attached to something else.
                                */
                               bpf_detachd(d);
                               BPFIF_DLIST_ENTRY_INIT(d);
                       }

                       bpf_attachd(d, bp);
               }
               reset_d(d);
               mutex_exit(d->bd_mtx);
               return (0);
       }
       /* Not found. */
       return (ENXIO);
}

/*
* Report an interface's name: copy IFNAMSIZ bytes of ifp->if_xname
* into ifr->ifr_name.
*/
static void
bpf_ifname(struct ifnet *ifp, struct ifreq *ifr)
{
       const char *from = ifp->if_xname;
       char *to = ifr->ifr_name;

       memcpy(to, from, IFNAMSIZ);
}

static int
bpf_stat(struct file *fp, struct stat *st)
{
       struct bpf_d *d = fp->f_bpf;

       (void)memset(st, 0, sizeof(*st));
       mutex_enter(d->bd_mtx);
       st->st_dev = makedev(cdevsw_lookup_major(&bpf_cdevsw), d->bd_pid);
       st->st_atimespec = d->bd_atime;
       st->st_mtimespec = d->bd_mtime;
       st->st_ctimespec = st->st_birthtimespec = d->bd_btime;
       st->st_uid = kauth_cred_geteuid(fp->f_cred);
       st->st_gid = kauth_cred_getegid(fp->f_cred);
       st->st_mode = S_IFCHR;
       mutex_exit(d->bd_mtx);
       return 0;
}

/*
* Support for poll() system call
*
* Return true iff the specific operation will not block indefinitely - with
* the assumption that it is safe to positively acknowledge a request for the
* ability to write to the BPF device.
* Otherwise, return false but make a note that a selnotify() must be done.
*/
static int
bpf_poll(struct file *fp, int events)
{
       struct bpf_d *d = fp->f_bpf;
       int revents;

       /*
        * Refresh the PID associated with this bpf file.
        */
       mutex_enter(&bpf_mtx);
       d->bd_pid = curproc->p_pid;

       revents = events & (POLLOUT | POLLWRNORM);
       if (events & (POLLIN | POLLRDNORM)) {
               /*
                * An imitation of the FIONREAD ioctl code.
                */
               mutex_enter(d->bd_mtx);
               mutex_enter(d->bd_buf_mtx);
               if (d->bd_hlen != 0 ||
                   ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
                       d->bd_slen != 0)) {
                       revents |= events & (POLLIN | POLLRDNORM);
               } else {
                       selrecord(curlwp, &d->bd_sel);
                       /* Start the read timeout if necessary */
                       if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
                               callout_reset(&d->bd_callout, d->bd_rtout,
                                   bpf_timed_out, d);
                               d->bd_state = BPF_WAITING;
                       }
               }
               mutex_exit(d->bd_buf_mtx);
               mutex_exit(d->bd_mtx);
       }

       mutex_exit(&bpf_mtx);
       return (revents);
}

/*
* kqueue detach hook: take the knote off the descriptor's select info,
* under the buffer mutex.
*/
static void
filt_bpfrdetach(struct knote *kn)
{
       struct bpf_d *bd = kn->kn_hook;

       mutex_enter(bd->bd_buf_mtx);
       selremove_knote(&bd->bd_sel, kn);
       mutex_exit(bd->bd_buf_mtx);
}

static int
filt_bpfread(struct knote *kn, long hint)
{
       struct bpf_d *d = kn->kn_hook;
       int rv;

       /*
        * Refresh the PID associated with this bpf file.
        */
       d->bd_pid = curproc->p_pid;

       if (hint & NOTE_SUBMIT)
               KASSERT(mutex_owned(d->bd_buf_mtx));
       else
               mutex_enter(d->bd_buf_mtx);
       kn->kn_data = d->bd_hlen;
       if (d->bd_immediate)
               kn->kn_data += d->bd_slen;
       rv = (kn->kn_data > 0);
       if (hint & NOTE_SUBMIT)
               KASSERT(mutex_owned(d->bd_buf_mtx));
       else
               mutex_exit(d->bd_buf_mtx);
       return rv;
}

/*
* Filter operations for EVFILT_READ knotes on a bpf descriptor, installed
* by bpf_kqfilter().  There is no attach hook: bpf_kqfilter() records the
* knote itself.
*/
static const struct filterops bpfread_filtops = {
       .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
       .f_attach = NULL,
       .f_detach = filt_bpfrdetach,
       .f_event = filt_bpfread,
};

/*
* Attach a knote to a bpf file.  Only EVFILT_READ is supported; any other
* filter yields EINVAL.  On success the knote is pointed at the
* descriptor and recorded on its select info, and 0 is returned.
*/
static int
bpf_kqfilter(struct file *fp, struct knote *kn)
{
       struct bpf_d *d;

       if (kn->kn_filter != EVFILT_READ)
               return (EINVAL);

       d = fp->f_bpf;
       kn->kn_fop = &bpfread_filtops;
       kn->kn_hook = d;

       mutex_enter(d->bd_buf_mtx);
       selrecord_knote(&d->bd_sel, kn);
       mutex_exit(d->bd_buf_mtx);

       return (0);
}

/*
* memcpy-shaped copy routine whose source is an mbuf chain rather than a
* flat buffer (derived from m_copydata in sys/uipc_mbuf.c).
*
* dst_arg  destination buffer, at least len bytes
* src_arg  first mbuf of the chain to copy from
* len      number of bytes to copy
*
* Panics if the chain ends before len bytes have been copied.
* Returns dst_arg.
*/
static void *
bpf_mcpy(void *dst_arg, const void *src_arg, size_t len)
{
       const struct mbuf *cur = src_arg;
       u_char *out = dst_arg;
       u_int chunk;

       for (; len > 0; cur = cur->m_next) {
               if (cur == NULL)
                       panic("bpf_mcpy");
               /* Take what this mbuf holds, but no more than is wanted. */
               chunk = uimin(cur->m_len, len);
               memcpy(out, mtod(cur, const void *), chunk);
               out += chunk;
               len -= chunk;
       }
       return dst_arg;
}

/*
* Run the filter currently stored in *filter over a packet.
*
* pkt     packet, either a flat buffer or an mbuf chain
* pktlen  length of the packet, passed to the filter as the wire length
* buflen  passed to the filter as the buffer length
*
* Returns (u_int)-1 when no filter is installed (accept everything),
* otherwise whatever the JIT-compiled code, or failing that the
* interpreter, returns.
*/
static inline u_int
bpf_xfilter(struct bpf_filter **filter, void *pkt, u_int pktlen, u_int buflen)
{
       struct bpf_filter *f;
       uint32_t scratch[BPF_MEMWORDS];
       bpf_args_t args = {
               .pkt = (const uint8_t *)pkt,
               .wirelen = pktlen,
               .buflen = buflen,
               .mem = scratch,
               .arg = NULL
       };

       f = atomic_load_consume(filter);
       if (f == NULL) /* No filter means accept all. */
               return (u_int)-1;

       if (f->bf_jitcode == NULL)
               return bpf_filter_ext(NULL, f->bf_insn, &args);

       return f->bf_jitcode(NULL, &args);
}

/*
* Hand one packet to every listener attached to interface bp.
*
* cpfn      routine used to copy pkt into a listener's buffer
* pkt       the packet, either a data buffer or an mbuf chain
* pktlen    length of the packet
* buflen    buffer length, if pkt is a data buffer
* direction BPF_D_IN or BPF_D_OUT
*
* Listeners that asked only for the opposite direction are skipped.  For
* the rest the receive count is bumped, the read filter is run, and the
* packet is captured if the filter returns nonzero.  The timestamp is
* taken once, on the first capture.
*/
static inline void
bpf_deliver(struct bpf_if *bp, void *(*cpfn)(void *, const void *, size_t),
   void *pkt, u_int pktlen, u_int buflen, const u_int direction)
{
       struct timespec ts;
       struct bpf_d *d;
       bool stamped = false;
       u_int snap;
       int s;

       KASSERT(!cpu_intr_p());

       /*
        * Note that the IPL does not have to be raised at this point.
        * The only problem that could arise here is that if two different
        * interfaces shared any data.  This is not the case.
        */
       s = pserialize_read_enter();
       BPFIF_DLIST_READER_FOREACH(d, bp) {
               /* Anything that is not BPF_D_IN is treated as outgoing. */
               if (direction == BPF_D_IN && d->bd_direction == BPF_D_OUT)
                       continue;
               if (direction != BPF_D_IN && d->bd_direction == BPF_D_IN)
                       continue;

               atomic_inc_ulong(&d->bd_rcount);
               BPF_STATINC(recv);

               snap = bpf_xfilter(&d->bd_rfilter, pkt, pktlen, buflen);
               if (snap == 0)
                       continue;

               if (!stamped) {
                       nanotime(&ts);
                       stamped = true;
               }
               /* Assume catchpacket doesn't sleep */
               catchpacket(d, pkt, pktlen, snap, cpfn, &ts);
       }
       pserialize_read_exit(s);
}

/*
* Tap a packet whose first dlen bytes live in the buffer at data and
* whose remainder is the mbuf chain m.  A temporary on-stack mbuf
* describing the buffer is put in front of the chain and the result is
* delivered as a single chain.
*/
static void
_bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m,
   u_int direction)
{
       struct mbuf head;
       u_int total;

       /* Skip outgoing duplicate packets. */
       if ((m->m_flags & M_PROMISC) != 0) {
               if (m->m_pkthdr.rcvif_index == 0) {
                       m->m_flags &= ~M_PROMISC;
                       return;
               }
       }

       /*
        * Craft on-stack mbuf suitable for passing to bpf_filter.
        * Note that we cut corners here; we only set up what's
        * absolutely needed--this mbuf should never go anywhere else.
        */
       (void)memset(&head, 0, sizeof(head));
       head.m_type = MT_DATA;
       head.m_data = data;
       head.m_len = dlen;
       head.m_next = m;

       total = m_length(m) + dlen;

       bpf_deliver(bp, bpf_mcpy, &head, total, 0, direction);
}

/*
* Tap a packet held in the mbuf chain m.
*
* Outgoing duplicates (M_PROMISC set with a zero receive interface index)
* only have the flag cleared, and empty packets are ignored.  When the
* whole packet sits in the first mbuf it is delivered as a flat buffer
* copied with memcpy; otherwise the chain is delivered and copied with
* bpf_mcpy.
*/
static void
_bpf_mtap(struct bpf_if *bp, struct mbuf *m, u_int direction)
{
       u_int pktlen;

       /* Skip outgoing duplicate packets. */
       if ((m->m_flags & M_PROMISC) != 0) {
               if (m->m_pkthdr.rcvif_index == 0) {
                       m->m_flags &= ~M_PROMISC;
                       return;
               }
       }

       pktlen = m_length(m);

       /* Skip zero-sized packets. */
       if (__predict_false(pktlen == 0))
               return;

       if (pktlen != m->m_len) {
               /* Data is spread over several mbufs. */
               bpf_deliver(bp, bpf_mcpy, m, pktlen, 0, direction);
               return;
       }

       /* Contiguous: the buffer length equals the packet length. */
       KASSERT(pktlen != 0);
       bpf_deliver(bp, (void *)memcpy, mtod(m, void *), pktlen, pktlen,
           direction);
}

/*
* Tap a packet with the address family prepended as a four byte field.
* A dummy on-stack mbuf whose data is the af argument itself is chained
* in front of m to pacify bpf.  This is safe because bpf will only read
* from the mbuf (i.e., it won't try to free it or keep a pointer to it).
*/
static void
_bpf_mtap_af(struct bpf_if *bp, uint32_t af, struct mbuf *m, u_int direction)
{
       struct mbuf afhdr;

       /* Payload: the four bytes of af. */
       afhdr.m_data = (char *)&af;
       afhdr.m_len = 4;

       /* Linkage and bookkeeping. */
       afhdr.m_next = m;
       afhdr.m_nextpkt = NULL;
       afhdr.m_owner = NULL;
       afhdr.m_type = MT_DATA;
       afhdr.m_flags = 0;

       _bpf_mtap(bp, &afhdr, direction);
}

/*
* Tap an incoming SLIP packet.  The SLIP pseudo-"link header" (direction
* byte plus the compressed header from chdr) is prepended to *m for the
* tap and trimmed off again afterwards.
*
* Note this M_PREPEND() should never fail, since we know we always have
* enough space in the input buffer; if *m comes back NULL anyway the
* function returns without tapping.
*/
static void
_bpf_mtap_sl_in(struct bpf_if *bp, u_char *chdr, struct mbuf **m)
{
       u_char *slhdr;

       M_PREPEND(*m, SLIP_HDRLEN, M_DONTWAIT);
       if (*m == NULL)
               return;

       slhdr = mtod(*m, u_char *);
       (void)memcpy(&slhdr[SLX_CHDR], chdr, CHDR_LEN);
       slhdr[SLX_DIR] = SLIPDIR_IN;

       _bpf_mtap(bp, *m, BPF_D_IN);

       /* Drop the pseudo header again. */
       m_adj(*m, SLIP_HDRLEN);
}

/*
* Tap an outgoing SLIP packet.  The SLIP pseudo-"link header" is built in
* the internal storage of an on-stack mbuf chained in front of m; the
* compressed header is now at the beginning of the mbuf.  m is freed
* once the tap is done.
*/
static void
_bpf_mtap_sl_out(struct bpf_if *bp, u_char *chdr, struct mbuf *m)
{
       struct mbuf slm;
       u_char *slhdr;

       /* The header bytes live inside the temporary mbuf itself. */
       slm.m_data = slm.m_dat;
       slm.m_len = SLIP_HDRLEN;

       slm.m_next = m;
       slm.m_nextpkt = NULL;
       slm.m_owner = NULL;
       slm.m_type = MT_DATA;
       slm.m_flags = 0;

       slhdr = mtod(&slm, u_char *);
       (void)memcpy(&slhdr[SLX_CHDR], chdr, CHDR_LEN);
       slhdr[SLX_DIR] = SLIPDIR_OUT;

       _bpf_mtap(bp, &slm, BPF_D_OUT);
       m_freem(m);
}

/*
* Duplicate m and append the copy to the interface's queue of packets
* waiting to be tapped.  Returns the copy, or NULL (queue untouched) if
* the packet could not be duplicated.
*/
static struct mbuf *
bpf_mbuf_enqueue(struct bpf_if *bp, struct mbuf *m)
{
       struct mbuf *copy = m_dup(m, 0, M_COPYALL, M_NOWAIT);

       if (copy == NULL)
               return NULL;

       /* An empty queue gets a new head; otherwise link behind the tail. */
       if (bp->bif_mbuf_tail == NULL)
               bp->bif_mbuf_head = copy;
       else
               bp->bif_mbuf_tail->m_nextpkt = copy;
       bp->bif_mbuf_tail = copy;

#ifdef BPF_MTAP_SOFTINT_DEBUG
       log(LOG_DEBUG, "%s: enqueued mbuf=%p to %s\n",
           __func__, copy, bp->bif_ifp->if_xname);
#endif

       return copy;
}

/*
* Remove and return the packet at the head of the interface's queue of
* packets waiting to be tapped, or NULL if the queue is empty.  Runs at
* splnet.
*/
static struct mbuf *
bpf_mbuf_dequeue(struct bpf_if *bp)
{
       struct mbuf *first;
       int s;

       /* XXX NOMPSAFE: assumed running on one CPU */
       s = splnet();

       first = bp->bif_mbuf_head;
       if (first == NULL) {
               splx(s);
               return NULL;
       }

       /* Unlink the head; an emptied queue has no tail either. */
       bp->bif_mbuf_head = first->m_nextpkt;
       first->m_nextpkt = NULL;
       if (bp->bif_mbuf_head == NULL)
               bp->bif_mbuf_tail = NULL;

#ifdef BPF_MTAP_SOFTINT_DEBUG
       log(LOG_DEBUG, "%s: dequeued mbuf=%p from %s\n",
           __func__, first, bp->bif_ifp->if_xname);
#endif

       splx(s);

       return first;
}

/*
* Handler scheduled by _bpf_mtap_softint(): drain the interface's queue,
* tapping each packet as incoming and then freeing it.  arg is the
* struct bpf_if.
*/
static void
bpf_mtap_si(void *arg)
{
       struct bpf_if *bp = arg;
       struct mbuf *pending;

       for (;;) {
               pending = bpf_mbuf_dequeue(bp);
               if (pending == NULL)
                       break;
#ifdef BPF_MTAP_SOFTINT_DEBUG
               log(LOG_DEBUG, "%s: tapping mbuf=%p on %s\n",
                   __func__, pending, bp->bif_ifp->if_xname);
#endif
               bpf_ops->bpf_mtap(bp, pending, BPF_D_IN);
               m_freem(pending);
       }
}

/*
* Tap entry point for interrupt context (asserted): instead of tapping
* directly, queue a copy of m and schedule bpf_mtap_si() to do the work.
* Nothing happens when the interface has no listeners or the copy cannot
* be made.
*/
static void
_bpf_mtap_softint(struct ifnet *ifp, struct mbuf *m)
{
       struct bpf_if *bp = ifp->if_bpf;

       KASSERT(cpu_intr_p());

       /* To avoid extra invocations of the softint */
       if (BPFIF_DLIST_READER_EMPTY(bp))
               return;
       KASSERT(bp->bif_si != NULL);

       if (bpf_mbuf_enqueue(bp, m) == NULL)
               return;

       softint_schedule(bp->bif_si);
}

/*
* Compute the length of the bpf header for descriptor d.  This is not
* necessarily equal to SIZEOF_BPF_HDR because we want to insert spacing
* such that the network layer header begins on a longword boundary (for
* performance reasons and to alleviate alignment restrictions).  On
* _LP64 kernels a descriptor flagged bd_compat32 uses the 32-bit header
* size and alignment instead.
*/
static int
bpf_hdrlen(struct bpf_d *d)
{
       int linklen = d->bd_bif->bif_hdrlen;

#ifdef _LP64
       if (d->bd_compat32)
               return (BPF_WORDALIGN32(linklen + SIZEOF_BPF_HDR32) - linklen);
#endif
       return (BPF_WORDALIGN(linklen + SIZEOF_BPF_HDR) - linklen);
}

/*
* Move the packet data from interface memory (pkt) into the
* store buffer. Call the wakeup functions if it's time to wake up
* a listener (buffer full), "cpfn" is the routine called to do the
* actual data transfer. memcpy is passed in to copy contiguous chunks,
* while bpf_mcpy is passed in to copy mbuf chains.  In the latter case,
* pkt is really an mbuf.
*
* d        descriptor whose store buffer receives the packet
* pkt      packet data, or an mbuf chain (see above)
* pktlen   full length of the packet, recorded as bh_datalen
* snaplen  number of bytes the filter asked to keep (bpf_deliver() passes
*          the filter's return value)
* cpfn     copy routine matching the form of pkt
* ts       timestamp to record in the bpf header
*
* The capture count is always incremented; the drop count is incremented
* when the packet does not fit and there is no free buffer to rotate in.
*/
static void
catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
   void *(*cpfn)(void *, const void *, size_t), struct timespec *ts)
{
       char *h;
       int totlen, curlen, caplen;
       int hdrlen = bpf_hdrlen(d);
       int do_wakeup = 0;

       atomic_inc_ulong(&d->bd_ccount);
       BPF_STATINC(capt);
       /*
        * Figure out how many bytes to move.  If the packet is
        * greater or equal to the snapshot length, transfer that
        * much.  Otherwise, transfer the whole packet (unless
        * we hit the buffer size limit).
        */
       totlen = hdrlen + uimin(snaplen, pktlen);
       if (totlen > d->bd_bufsize)
               totlen = d->bd_bufsize;
       /*
        * If we adjusted totlen to fit the bufsize, it could be that
        * totlen is smaller than hdrlen because of the link layer header.
        */
       caplen = totlen - hdrlen;
       if (caplen < 0)
               caplen = 0;

       /* Everything that touches the buffers happens under bd_buf_mtx. */
       mutex_enter(d->bd_buf_mtx);
       /*
        * Round up the end of the previous packet to the next longword.
        */
#ifdef _LP64
       if (d->bd_compat32)
               curlen = BPF_WORDALIGN32(d->bd_slen);
       else
#endif
               curlen = BPF_WORDALIGN(d->bd_slen);
       if (curlen + totlen > d->bd_bufsize) {
               /*
                * This packet will overflow the storage buffer.
                * Rotate the buffers if we can, then wakeup any
                * pending reads.
                */
               if (d->bd_fbuf == NULL) {
                       mutex_exit(d->bd_buf_mtx);
                       /*
                        * We haven't completed the previous read yet,
                        * so drop the packet.
                        */
                       atomic_inc_ulong(&d->bd_dcount);
                       BPF_STATINC(drop);
                       return;
               }
               ROTATE_BUFFERS(d);
               do_wakeup = 1;
               /* This packet goes at the start of the store buffer. */
               curlen = 0;
       } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
               /*
                * Immediate mode is set, or the read timeout has
                * already expired during a select call.  A packet
                * arrived, so the reader should be woken up.
                */
               do_wakeup = 1;
       }

       /*
        * Append the bpf header.  The timestamp is stored with
        * microsecond resolution.
        */
       h = (char *)d->bd_sbuf + curlen;
#ifdef _LP64
       if (d->bd_compat32) {
               struct bpf_hdr32 *hp32;

               hp32 = (struct bpf_hdr32 *)h;
               hp32->bh_tstamp.tv_sec = ts->tv_sec;
               hp32->bh_tstamp.tv_usec = ts->tv_nsec / 1000;
               hp32->bh_datalen = pktlen;
               hp32->bh_hdrlen = hdrlen;
               hp32->bh_caplen = caplen;
       } else
#endif
       {
               struct bpf_hdr *hp;

               hp = (struct bpf_hdr *)h;
               hp->bh_tstamp.tv_sec = ts->tv_sec;
               hp->bh_tstamp.tv_usec = ts->tv_nsec / 1000;
               hp->bh_datalen = pktlen;
               hp->bh_hdrlen = hdrlen;
               hp->bh_caplen = caplen;
       }

       /*
        * Copy the packet data into the store buffer and update its length.
        */
       (*cpfn)(h + hdrlen, pkt, caplen);
       d->bd_slen = curlen + totlen;

       /*
        * Call bpf_wakeup after bd_slen has been updated so that kevent(2)
        * will cause filt_bpfread() to be called with it adjusted.
        */
       if (do_wakeup)
               bpf_wakeup(d);

       mutex_exit(d->bd_buf_mtx);
}

/*
* Initialize all nonzero fields of a descriptor.
*/
static int
bpf_allocbufs(struct bpf_d *d)
{

       d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
       if (!d->bd_fbuf)
               return (ENOBUFS);
       d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
       if (!d->bd_sbuf) {
               kmem_free(d->bd_fbuf, d->bd_bufsize);
               return (ENOBUFS);
       }
       d->bd_slen = 0;
       d->bd_hlen = 0;
       return (0);
}

/*
* Release a filter: its instruction array and its JIT code, when
* present, followed by the filter structure itself.
* The filter pointer must not be NULL.
*/
static void
bpf_free_filter(struct bpf_filter *filter)
{

       KASSERT(filter != NULL);

       /* The instruction array was sized by bf_size. */
       if (filter->bf_insn)
               kmem_free(filter->bf_insn, filter->bf_size);

       if (filter->bf_jitcode)
               bpf_jit_freecode(filter->bf_jitcode);

       kmem_free(filter, sizeof(*filter));
}

/*
* Release the packet buffers and the filters owned by a descriptor.
* Called on close.
*/
static void
bpf_freed(struct bpf_d *d)
{
       /*
        * There is no need to lock out interrupts here: the descriptor
        * has been detached from its interface and has not yet been
        * marked free.
        */

       /* The hold and free buffers are only looked at if a store buffer exists. */
       if (d->bd_sbuf) {
               kmem_free(d->bd_sbuf, d->bd_bufsize);
               if (d->bd_hbuf)
                       kmem_free(d->bd_hbuf, d->bd_bufsize);
               if (d->bd_fbuf)
                       kmem_free(d->bd_fbuf, d->bd_bufsize);
       }

       /* Read filter. */
       if (d->bd_rfilter) {
               bpf_free_filter(d->bd_rfilter);
               d->bd_rfilter = NULL;
       }

       /* Write filter. */
       if (d->bd_wfilter) {
               bpf_free_filter(d->bd_wfilter);
               d->bd_wfilter = NULL;
       }

       d->bd_jitcode = NULL;
}

/*
* Attach an interface to bpf.  dlt is the link layer type;
* hdrlen is the fixed size of the link header for the specified dlt
* (variable length headers not yet supported).
*
* A new bpf_if is allocated (sleeping if necessary), filled in and
* inserted at the head of the interface list under bpf_mtx.  *driverp
* is set to NULL.
*/
static void
_bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
{
       struct bpf_if *bp;

       bp = kmem_alloc(sizeof(*bp), KM_SLEEP);

       mutex_enter(&bpf_mtx);
       /*
        * kmem_alloc() does not zero the memory, so fill in every field
        * before the entry becomes reachable through the interface list.
        */
       bp->bif_driverp = driverp;
       bp->bif_ifp = ifp;
       bp->bif_dlt = dlt;
       bp->bif_hdrlen = hdrlen;
       bp->bif_si = NULL;
       bp->bif_mbuf_head = NULL;
       bp->bif_mbuf_tail = NULL;
       BPF_IFLIST_ENTRY_INIT(bp);
       PSLIST_INIT(&bp->bif_dlist_head);
       psref_target_init(&bp->bif_psref, bpf_psref_class);
       SLIST_INIT(&bp->bif_trackers);

       /* Publish the fully initialized entry. */
       BPF_IFLIST_WRITER_INSERT_HEAD(bp);

       *bp->bif_driverp = NULL;
       mutex_exit(&bpf_mtx);
#if 0
       printf("bpf: %s attached with dlt %x\n", ifp->if_xname, dlt);
#endif
}

static void
_bpf_mtap_softint_init(struct ifnet *ifp)
{
       struct bpf_if *bp;

       mutex_enter(&bpf_mtx);
       BPF_IFLIST_WRITER_FOREACH(bp) {
               if (bp->bif_ifp != ifp)
                       continue;

               bp->bif_mbuf_head = NULL;
               bp->bif_mbuf_tail = NULL;
               bp->bif_si = softint_establish(SOFTINT_NET, bpf_mtap_si, bp);
               if (bp->bif_si == NULL)
                       panic("%s: softint_establish() failed", __func__);
               break;
       }
       mutex_exit(&bpf_mtx);

       if (bp == NULL)
               panic("%s: no bpf_if found for %s", __func__, ifp->if_xname);
}

/*
* Remove an interface from bpf.
*
* Works in two passes, both under bpf_mtx:
*  1. detach every descriptor currently bound to ifp;
*  2. unlink and free every bpf_if registered for ifp, together with
*     its event trackers, queued mbufs and soft interrupt.
*/
static void
_bpfdetach(struct ifnet *ifp)
{
       struct bpf_if *bp;
       struct bpf_d *d;
       int s;

       mutex_enter(&bpf_mtx);
       /* Detach every open descriptor that is listening on ifp. */
again_d:
       BPF_DLIST_WRITER_FOREACH(d) {
               mutex_enter(d->bd_mtx);
               if (d->bd_bif != NULL && d->bd_bif->bif_ifp == ifp) {
                       /*
                        * Detach the descriptor from an interface now.
                        * It will be free'ed later by close routine.
                        */
                       bpf_detachd(d);
                       mutex_exit(d->bd_mtx);
                       /* Rescan from the head after each detach. */
                       goto again_d;
               }
               mutex_exit(d->bd_mtx);
       }

again:
       BPF_IFLIST_WRITER_FOREACH(bp) {
               if (bp->bif_ifp == ifp) {
                       /* Unlink first so no new lookup can find the entry. */
                       BPF_IFLIST_WRITER_REMOVE(bp);

                       /*
                        * pserialize_perform() and psref_target_destroy()
                        * both run before any memory of the entry is
                        * released below.
                        */
                       pserialize_perform(bpf_psz);
                       psref_target_destroy(&bp->bif_psref, bpf_psref_class);

                       /* Free all event trackers registered on this entry. */
                       while (!SLIST_EMPTY(&bp->bif_trackers)) {
                               struct bpf_event_tracker *t =
                                   SLIST_FIRST(&bp->bif_trackers);
                               SLIST_REMOVE_HEAD(&bp->bif_trackers,
                                   bet_entries);
                               kmem_free(t, sizeof(*t));
                       }

                       BPF_IFLIST_ENTRY_DESTROY(bp);
                       /*
                        * The mbuf queue is only touched when a soft
                        * interrupt was established for this entry.
                        */
                       if (bp->bif_si != NULL) {
                               /* XXX NOMPSAFE: assumed running on one CPU */
                               s = splnet();
                               /* Drop every packet still queued. */
                               while (bp->bif_mbuf_head != NULL) {
                                       struct mbuf *m = bp->bif_mbuf_head;
                                       bp->bif_mbuf_head = m->m_nextpkt;
                                       m_freem(m);
                               }
                               splx(s);
                               softint_disestablish(bp->bif_si);
                       }
                       kmem_free(bp, sizeof(*bp));
                       /* The list changed; restart the walk from the head. */
                       goto again;
               }
       }
       mutex_exit(&bpf_mtx);
}

/*
* Change the data link type of an interface.
*
* The bpf_if whose driver pointer is &ifp->if_bpf gets its link type and
* link header length replaced.  Panics if no such bpf_if exists.
*/
static void
_bpf_change_type(struct ifnet *ifp, u_int dlt, u_int hdrlen)
{
       struct bpf_if *bp;

       mutex_enter(&bpf_mtx);
       BPF_IFLIST_WRITER_FOREACH(bp) {
               if (bp->bif_driverp != &ifp->if_bpf)
                       continue;

               bp->bif_dlt = dlt;
               bp->bif_hdrlen = hdrlen;
               mutex_exit(&bpf_mtx);
               return;
       }

       /* No entry matched; bpf_mtx is still held at this point. */
       panic("bpf_change_type");
}

/*
* Get a list of available data link type of the interface.
*/
static int
bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
{
       int n, error;
       struct ifnet *ifp;
       struct bpf_if *bp;
       int s, bound;

       KASSERT(mutex_owned(d->bd_mtx));

       ifp = d->bd_bif->bif_ifp;
       n = 0;
       error = 0;

       bound = curlwp_bind();
       s = pserialize_read_enter();
       BPF_IFLIST_READER_FOREACH(bp) {
               if (bp->bif_ifp != ifp)
                       continue;
               if (bfl->bfl_list != NULL) {
                       struct psref psref;

                       if (n >= bfl->bfl_len) {
                               pserialize_read_exit(s);
                               return ENOMEM;
                       }

                       bpf_if_acquire(bp, &psref);
                       pserialize_read_exit(s);

                       error = copyout(&bp->bif_dlt,
                           bfl->bfl_list + n, sizeof(u_int));

                       s = pserialize_read_enter();
                       bpf_if_release(bp, &psref);
               }
               n++;
       }
       pserialize_read_exit(s);
       curlwp_bindx(bound);

       bfl->bfl_len = n;
       return error;
}

/*
* Set the data link type of a BPF instance.
*
* Looks for a bpf_if on the same interface with the requested link
* type, moves the descriptor over to it and resets the descriptor.  If
* the descriptor was promiscuous beforehand, promiscuous mode is
* requested again on the new attachment; a failure to do so is only
* logged.
*
* Returns 0 on success (including when dlt is already in effect) or
* EINVAL if the interface has no bpf_if with that link type.
*
* The caller must hold both bpf_mtx and d->bd_mtx.
*/
static int
bpf_setdlt(struct bpf_d *d, u_int dlt)
{
       struct bpf_if *newbp;
       struct ifnet *ifp;
       int was_promisc, error;

       KASSERT(mutex_owned(&bpf_mtx));
       KASSERT(mutex_owned(d->bd_mtx));

       /* Nothing to do when the requested type is already selected. */
       if (d->bd_bif->bif_dlt == dlt)
               return 0;

       ifp = d->bd_bif->bif_ifp;
       BPF_IFLIST_WRITER_FOREACH(newbp) {
               if (newbp->bif_dlt == dlt && newbp->bif_ifp == ifp)
                       break;
       }
       if (newbp == NULL)
               return EINVAL;

       /* Remember the promiscuous setting before detaching. */
       was_promisc = d->bd_promisc;
       bpf_detachd(d);
       BPFIF_DLIST_ENTRY_INIT(d);
       bpf_attachd(d, newbp);
       reset_d(d);

       if (!was_promisc)
               return 0;

       KERNEL_LOCK_UNLESS_NET_MPSAFE();
       error = ifpromisc(newbp->bif_ifp, 1);
       KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
       if (error == 0)
               d->bd_promisc = 1;
       else
               printf("%s: bpf_setdlt: ifpromisc failed (%d)\n",
                   newbp->bif_ifp->if_xname, error);

       return 0;
}

static int
sysctl_net_bpf_maxbufsize(SYSCTLFN_ARGS)
{
       int newsize, error;
       struct sysctlnode node;

       node = *rnode;
       node.sysctl_data = &newsize;
       newsize = bpf_maxbufsize;
       error = sysctl_lookup(SYSCTLFN_CALL(&node));
       if (error || newp == NULL)
               return (error);

       if (newsize < BPF_MINBUFSIZE || newsize > BPF_MAXBUFSIZE)
               return (EINVAL);

       bpf_maxbufsize = newsize;

       return (0);
}

#if defined(MODULAR) || defined(BPFJIT)
/*
* sysctl handler for the "jit" node.
*
* Reads report the current bpf_jit setting; writes store the new value.
* When JIT is switched on while bpfjit_module_ops has no code generator,
* a notice is printed.
*/
static int
sysctl_net_bpf_jit(SYSCTLFN_ARGS)
{
       struct sysctlnode node = *rnode;
       bool enable = bpf_jit;
       int error;

       /* Let sysctl_lookup() work on a local copy of the value. */
       node.sysctl_data = &enable;
       error = sysctl_lookup(SYSCTLFN_CALL(&node));
       if (error != 0)
               return error;

       /* Pure read: nothing to store. */
       if (newp == NULL)
               return 0;

       bpf_jit = enable;
       if (enable && bpfjit_module_ops.bj_generate_code == NULL) {
               printf("JIT compilation is postponed "
                   "until after bpfjit module is loaded\n");
       }

       return 0;
}
#endif

static int
sysctl_net_bpf_peers(SYSCTLFN_ARGS)
{
       int    error, elem_count;
       struct bpf_d     *dp;
       struct bpf_d_ext  dpe;
       size_t len, needed, elem_size, out_size;
       char   *sp;

       if (namelen == 1 && name[0] == CTL_QUERY)
               return (sysctl_query(SYSCTLFN_CALL(rnode)));

       if (namelen != 2)
               return (EINVAL);

       /* BPF peers is privileged information. */
       error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_INTERFACE,
           KAUTH_REQ_NETWORK_INTERFACE_GETPRIV, NULL, NULL, NULL);
       if (error)
               return (EPERM);

       len = (oldp != NULL) ? *oldlenp : 0;
       sp = oldp;
       elem_size = name[0];
       elem_count = name[1];
       out_size = MIN(sizeof(dpe), elem_size);
       needed = 0;

       if (elem_size < 1 || elem_count < 0)
               return (EINVAL);

       mutex_enter(&bpf_mtx);
       BPF_DLIST_WRITER_FOREACH(dp) {
               if (len >= elem_size && elem_count > 0) {
#define BPF_EXT(field)  dpe.bde_ ## field = dp->bd_ ## field
                       BPF_EXT(bufsize);
                       BPF_EXT(promisc);
                       BPF_EXT(state);
                       BPF_EXT(immediate);
                       BPF_EXT(hdrcmplt);
                       BPF_EXT(direction);
                       BPF_EXT(pid);
                       BPF_EXT(rcount);
                       BPF_EXT(dcount);
                       BPF_EXT(ccount);
#undef BPF_EXT
                       mutex_enter(dp->bd_mtx);
                       if (dp->bd_bif)
                               (void)strlcpy(dpe.bde_ifname,
                                   dp->bd_bif->bif_ifp->if_xname,
                                   IFNAMSIZ - 1);
                       else
                               dpe.bde_ifname[0] = '\0';
                       dpe.bde_locked = dp->bd_locked;
                       mutex_exit(dp->bd_mtx);

                       error = copyout(&dpe, sp, out_size);
                       if (error)
                               break;
                       sp += elem_size;
                       len -= elem_size;
               }
               needed += elem_size;
               if (elem_count > 0 && elem_count != INT_MAX)
                       elem_count--;
       }
       mutex_exit(&bpf_mtx);

       *oldlenp = needed;

       return (error);
}

/*
* Per-CPU callback: add one CPU's bpf_stat counters (p) into the
* running total (arg).  The additions are done at splnet.
*/
static void
bpf_stats(void *p, void *arg, struct cpu_info *ci __unused)
{
       struct bpf_stat *const percpu = p;
       struct bpf_stat *total = arg;
       int s;

       s = splnet();
       total->bs_recv += percpu->bs_recv;
       total->bs_drop += percpu->bs_drop;
       total->bs_capt += percpu->bs_capt;
       splx(s);
}

static int
bpf_sysctl_gstats_handler(SYSCTLFN_ARGS)
{
       struct sysctlnode node;
       int error;
       struct bpf_stat sum;

       memset(&sum, 0, sizeof(sum));
       node = *rnode;

       percpu_foreach_xcall(bpf_gstats_percpu, XC_HIGHPRI_IPL(IPL_SOFTNET),
           bpf_stats, &sum);

       node.sysctl_data = &sum;
       node.sysctl_size = sizeof(sum);
       error = sysctl_lookup(SYSCTLFN_CALL(&node));
       if (error != 0 || newp == NULL)
               return error;

       return 0;
}

/*
* Create the "bpf" sysctl node under CTL_NET and, if that succeeded,
* its children: "jit" (only with MODULAR or BPFJIT), "maxbufsize",
* "stats" and "peers".
*/
SYSCTL_SETUP(sysctl_net_bpf_setup, "bpf sysctls")
{
       const struct sysctlnode *bpfnode = NULL;

       sysctl_createv(clog, 0, NULL, &bpfnode,
           CTLFLAG_PERMANENT,
           CTLTYPE_NODE, "bpf",
           SYSCTL_DESCR("BPF options"),
           NULL, 0, NULL, 0,
           CTL_NET, CTL_CREATE, CTL_EOL);

       /* Without the parent node there is nowhere to hang the children. */
       if (bpfnode == NULL)
               return;

#if defined(MODULAR) || defined(BPFJIT)
       sysctl_createv(clog, 0, NULL, NULL,
           CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
           CTLTYPE_BOOL, "jit",
           SYSCTL_DESCR("Toggle Just-In-Time compilation"),
           sysctl_net_bpf_jit, 0, &bpf_jit, 0,
           CTL_NET, bpfnode->sysctl_num, CTL_CREATE, CTL_EOL);
#endif

       sysctl_createv(clog, 0, NULL, NULL,
           CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
           CTLTYPE_INT, "maxbufsize",
           SYSCTL_DESCR("Maximum size for data capture buffer"),
           sysctl_net_bpf_maxbufsize, 0, &bpf_maxbufsize, 0,
           CTL_NET, bpfnode->sysctl_num, CTL_CREATE, CTL_EOL);

       sysctl_createv(clog, 0, NULL, NULL,
           CTLFLAG_PERMANENT,
           CTLTYPE_STRUCT, "stats",
           SYSCTL_DESCR("BPF stats"),
           bpf_sysctl_gstats_handler, 0, NULL, 0,
           CTL_NET, bpfnode->sysctl_num, CTL_CREATE, CTL_EOL);

       sysctl_createv(clog, 0, NULL, NULL,
           CTLFLAG_PERMANENT,
           CTLTYPE_STRUCT, "peers",
           SYSCTL_DESCR("BPF peers"),
           sysctl_net_bpf_peers, 0, NULL, 0,
           CTL_NET, bpfnode->sysctl_num, CTL_CREATE, CTL_EOL);
}

/*
* Register an event callback on the bpf_if identified by driverp.
*
* A tracker holding _fun is allocated and inserted at the head of the
* tracker list of the first bpf_if whose driver pointer equals driverp.
*
* Returns 0 on success or ENOENT if no such bpf_if exists; in the
* latter case the tracker is released again.
*/
static int
_bpf_register_track_event(struct bpf_if **driverp,
   void (*_fun)(struct bpf_if *, struct ifnet *, int, int))
{
       struct bpf_if *bp;
       struct bpf_event_tracker *t;
       int ret = ENOENT;

       /*
        * Allocate before taking bpf_mtx.  A KM_SLEEP allocation cannot
        * fail, so no NULL check is needed.
        */
       t = kmem_zalloc(sizeof(*t), KM_SLEEP);
       t->bet_notify = _fun;

       mutex_enter(&bpf_mtx);
       BPF_IFLIST_WRITER_FOREACH(bp) {
               if (bp->bif_driverp != driverp)
                       continue;
               SLIST_INSERT_HEAD(&bp->bif_trackers, t, bet_entries);
               ret = 0;
               break;
       }
       mutex_exit(&bpf_mtx);

       /* Nobody took ownership of the tracker; do not leak it. */
       if (ret != 0)
               kmem_free(t, sizeof(*t));

       return ret;
}

static int
_bpf_deregister_track_event(struct bpf_if **driverp,
   void (*_fun)(struct bpf_if *, struct ifnet *, int, int))
{
       struct bpf_if *bp;
       struct bpf_event_tracker *t = NULL;
       int ret = ENOENT;

       mutex_enter(&bpf_mtx);
       BPF_IFLIST_WRITER_FOREACH(bp) {
               if (bp->bif_driverp != driverp)
                       continue;
               SLIST_FOREACH(t, &bp->bif_trackers, bet_entries) {
                       if (t->bet_notify == _fun) {
                               ret = 0;
                               break;
                       }
               }
               if (ret == 0)
                       break;
       }
       if (ret == 0 && t && t->bet_notify == _fun) {
               SLIST_REMOVE(&bp->bif_trackers, t, bpf_event_tracker,
                   bet_entries);
       }
       mutex_exit(&bpf_mtx);
       if (ret == 0)
               kmem_free(t, sizeof(*t));
       return ret;
}

struct bpf_ops bpf_ops_kernel = {
       .bpf_attach =           _bpfattach,
       .bpf_detach =           _bpfdetach,
       .bpf_change_type =      _bpf_change_type,
       .bpf_register_track_event = _bpf_register_track_event,
       .bpf_deregister_track_event = _bpf_deregister_track_event,

       .bpf_mtap =             _bpf_mtap,
       .bpf_mtap2 =            _bpf_mtap2,
       .bpf_mtap_af =          _bpf_mtap_af,
       .bpf_mtap_sl_in =       _bpf_mtap_sl_in,
       .bpf_mtap_sl_out =      _bpf_mtap_sl_out,

       .bpf_mtap_softint =             _bpf_mtap_softint,
       .bpf_mtap_softint_init =        _bpf_mtap_softint_init,
};

MODULE(MODULE_CLASS_DRIVER, bpf, "bpf_filter");

/*
* Module control entry point.
*
* MODULE_CMD_INIT: initialize bpf, attach the device switch entry when
* built as a loadable module, and install bpf_ops_kernel as the active
* operations vector.  Returns the devsw_attach() error, if any.
* MODULE_CMD_FINI: unloading is not supported; returns EOPNOTSUPP.
* Any other command: returns ENOTTY.
*/
static int
bpf_modcmd(modcmd_t cmd, void *arg)
{
#ifdef _MODULE
       devmajor_t bmajor, cmajor;
       int error;
#endif

       if (cmd == MODULE_CMD_INIT) {
               bpf_init();
#ifdef _MODULE
               bmajor = cmajor = NODEVMAJOR;
               error = devsw_attach("bpf", NULL, &bmajor,
                   &bpf_cdevsw, &cmajor);
               if (error)
                       return error;
#endif

               /* Swap in the real operations vector. */
               bpf_ops_handover_enter(&bpf_ops_kernel);
               atomic_swap_ptr(&bpf_ops, &bpf_ops_kernel);
               bpf_ops_handover_exit();
               return 0;
       }

       if (cmd == MODULE_CMD_FINI) {
               /*
                * While there is no reference counting for bpf callers,
                * unload could at least in theory be done similarly to
                * system call disestablishment.  This should even be
                * a little simpler:
                *
                * 1) replace op vector with stubs
                * 2) post update to all cpus with xc
                * 3) check that nobody is in bpf anymore
                *    (it's doubtful we'd want something like l_sysent,
                *     but we could do something like *signed* percpu
                *     counters.  if the sum is 0, we're good).
                * 4) if fail, unroll changes
                *
                * NOTE: the change won't be atomic to the outside.  Some
                * packets may not be captured even if the unload is
                * not successful.  Packet capture not working is a
                * perfectly logical consequence of trying to disable
                * packet capture.
                */
               return EOPNOTSUPP;
       }

       return ENOTTY;
}