/*      $NetBSD: uvm_pgflcache.c,v 1.6 2020/10/18 18:31:31 chs Exp $    */

/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in the
*    documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

/*
* uvm_pgflcache.c: page freelist cache.
*
* This implements a tiny per-CPU cache of pages that sits between the main
* page allocator and the freelists.  By allocating and freeing pages in
* batch, it reduces freelist contention by an order of magnitude.
*
* The cache can be paused & resumed at runtime so that UVM_HOTPLUG,
* uvm_pglistalloc() and uvm_page_redim() can have a consistent view of the
* world.  On systems with one CPU per physical package (e.g. a uniprocessor)
* the cache is not enabled.
*/
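
/*
* Rough data flow, as implemented below: uvm_pgflcache_alloc() and
* uvm_pgflcache_free() touch only the small per-CPU, per-color page
* arrays.  uvm_pgflcache_fill() pulls a batch of pages from this CPU's
* global freelist bucket into an array, and uvm_pgflcache_spill() pushes
* an array back to the bucket when it overflows or the cache is paused.
*/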

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pgflcache.c,v 1.6 2020/10/18 18:31:31 chs Exp $");

#include "opt_uvm.h"
#include "opt_multiprocessor.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/xcall.h>

#include <uvm/uvm.h>
#include <uvm/uvm_pglist.h>
#include <uvm/uvm_pgflcache.h>

/* There is no point doing any of this on a uniprocessor. */
#ifdef MULTIPROCESSOR

/*
* MAXPGS - maximum pages per color, per bucket.
* FILLPGS - number of pages to allocate at once, per color, per bucket.
*
* Why the chosen values:
*
* (1) In 2019, an average Intel system has 4kB pages and 8x L2 cache
* colors.  We make the assumption that most of the time allocation activity
* will be centered around one UVM freelist, so most of the time there will
* be no more than 224kB worth of cached pages per-CPU.  That's tiny, but
* enough to hugely reduce contention on the freelist locks, and give us a
* small pool of pages which if we're very lucky may have some L1/L2 cache
* locality, and do so without subtracting too much from the L2/L3 cache
* benefits of having per-package free lists in the page allocator.
*
* (2) With the chosen values on _LP64, the data structure for each color
* takes up a single cache line (64 bytes), which keeps the overhead very
* low even in the "miss" case.
*
* (3) We don't want to cause too much pressure by hiding away memory that
* could otherwise be put to good use.
*/
#define MAXPGS          7
#define FILLPGS         6

/* Variable size, according to # colors. */
struct pgflcache {
       struct pccolor {
               intptr_t        count;
               struct vm_page  *pages[MAXPGS];
       } color[1];
};
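
/*
* Worked example of the sizing above, assuming the 2019-era system it
* describes (_LP64, 4kB pages, 8 colors): sizeof(struct pccolor) is
* 8 + 7 * 8 = 64 bytes, i.e. exactly one cache line per color, and a
* fully populated cache for one freelist holds at most
* MAXPGS * 8 colors * 4kB = 224kB of pages per CPU.
*/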

static kmutex_t         uvm_pgflcache_lock;
static int              uvm_pgflcache_sem;

/*
* uvm_pgflcache_fill: fill specified freelist/color from global list
*
* => must be called at IPL_VM
* => must be called with given bucket lock held
* => must only fill from the correct bucket for this CPU
*/

void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{
       struct pgflbucket *pgb;
       struct pgflcache *pc;
       struct pccolor *pcc;
       struct pgflist *head;
       struct vm_page *pg;
       int count;

       KASSERT(mutex_owned(&uvm_freelist_locks[b].lock));
       KASSERT(ucpu->pgflbucket == b);

       /* If caching is off, then bail out. */
       if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
               return;
       }

       /* Fill only to the limit. */
       pcc = &pc->color[c];
       pgb = uvm.page_free[fl].pgfl_buckets[b];
       head = &pgb->pgb_colors[c];
       if (pcc->count >= FILLPGS) {
               return;
       }

       /* Pull pages from the bucket until it's empty, or we are full. */
       count = pcc->count;
       pg = LIST_FIRST(head);
       while (__predict_true(pg != NULL && count < FILLPGS)) {
               KASSERT(pg->flags & PG_FREE);
               KASSERT(uvm_page_get_bucket(pg) == b);
               pcc->pages[count++] = pg;
               pg = LIST_NEXT(pg, pageq.list);
       }

       /* Violate LIST abstraction to remove all pages at once. */
       head->lh_first = pg;
       if (__predict_true(pg != NULL)) {
               pg->pageq.list.le_prev = &head->lh_first;
       }
       pgb->pgb_nfree -= (count - pcc->count);
       CPU_COUNT(CPU_COUNT_FREEPAGES, -(count - pcc->count));
       pcc->count = count;
}

/*
* uvm_pgflcache_spill: spill specified freelist/color to global list
*
* => must be called at IPL_VM
* => mark __noinline so we don't pull it into uvm_pgflcache_free()
*/

static void __noinline
uvm_pgflcache_spill(struct uvm_cpu *ucpu, int fl, int c)
{
       struct pgflbucket *pgb;
       struct pgfreelist *pgfl;
       struct pgflcache *pc;
       struct pccolor *pcc;
       struct pgflist *head;
       kmutex_t *lock;
       int b, adj;

       pc = ucpu->pgflcache[fl];
       pcc = &pc->color[c];
       pgfl = &uvm.page_free[fl];
       b = ucpu->pgflbucket;
       pgb = pgfl->pgfl_buckets[b];
       head = &pgb->pgb_colors[c];
       lock = &uvm_freelist_locks[b].lock;

       mutex_spin_enter(lock);
       for (adj = pcc->count; pcc->count != 0;) {
               pcc->count--;
               KASSERT(pcc->pages[pcc->count] != NULL);
               KASSERT(pcc->pages[pcc->count]->flags & PG_FREE);
               LIST_INSERT_HEAD(head, pcc->pages[pcc->count], pageq.list);
       }
       pgb->pgb_nfree += adj;
       CPU_COUNT(CPU_COUNT_FREEPAGES, adj);
       mutex_spin_exit(lock);
}

/*
* uvm_pgflcache_alloc: try to allocate a cached page.
*
* => must be called at IPL_VM
* => allocate only from the given freelist and given page color
*/

struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{
       struct pgflcache *pc;
       struct pccolor *pcc;
       struct vm_page *pg;

       /* If caching is off, then bail out. */
       if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
               return NULL;
       }

       /* Very simple: if we have a page then return it. */
       pcc = &pc->color[c];
       if (__predict_false(pcc->count == 0)) {
               return NULL;
       }
       pg = pcc->pages[--(pcc->count)];
       KASSERT(pg != NULL);
       KASSERT(pg->flags == PG_FREE);
       KASSERT(uvm_page_get_freelist(pg) == fl);
       KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
       pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
       return pg;
}
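
/*
* Usage sketch for the allocation path.  This is a hypothetical caller,
* shown only to illustrate the contract above (the real consumer lives in
* uvm_page.c), not the actual allocator logic: try the cache first and,
* on a miss, refill it from this CPU's bucket with the bucket lock held,
* then retry.
*
*      s = splvm();
*      ucpu = curcpu()->ci_data.cpu_uvm;
*      b = ucpu->pgflbucket;
*      pg = uvm_pgflcache_alloc(ucpu, fl, c);
*      if (__predict_false(pg == NULL)) {
*              mutex_spin_enter(&uvm_freelist_locks[b].lock);
*              uvm_pgflcache_fill(ucpu, fl, b, c);
*              mutex_spin_exit(&uvm_freelist_locks[b].lock);
*              pg = uvm_pgflcache_alloc(ucpu, fl, c);
*      }
*      splx(s);
*/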

/*
* uvm_pgflcache_free: cache a page, if possible.
*
* => must be called at IPL_VM
* => must only send pages for the correct bucket for this CPU
*/

bool
uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{
       struct pgflcache *pc;
       struct pccolor *pcc;
       int fl, c;

       KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);

       /* If caching is off, then bail out. */
       fl = uvm_page_get_freelist(pg);
       if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
               return false;
       }

        /* If the array is full, spill it first, then add the page. */
       c = VM_PGCOLOR(pg);
       pcc = &pc->color[c];
       KASSERT((pg->flags & PG_FREE) == 0);
       if (__predict_false(pcc->count == MAXPGS)) {
               uvm_pgflcache_spill(ucpu, fl, c);
       }
       pg->flags = PG_FREE;
       pcc->pages[pcc->count] = pg;
       pcc->count++;
       return true;
}
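
/*
* Usage sketch for the free path (hypothetical again, illustrating the
* contract above): offer the page to the cache first, and only fall back
* to the locked bucket when the cache declines it.
*
*      s = splvm();
*      if (!uvm_pgflcache_free(ucpu, pg)) {
*              ... insert pg on its bucket's color list, holding
*                  uvm_freelist_locks[b].lock ...
*      }
*      splx(s);
*/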

/*
* uvm_pgflcache_init_cpu: allocate and initialize per-CPU data structures
* for the free page cache.  Don't set anything in motion - that's taken
* care of by uvm_pgflcache_resume().
*/

static void
uvm_pgflcache_init_cpu(struct cpu_info *ci)
{
       struct uvm_cpu *ucpu;
       size_t sz;

       ucpu = ci->ci_data.cpu_uvm;
       KASSERT(ucpu->pgflcachemem == NULL);
       KASSERT(ucpu->pgflcache[0] == NULL);

       sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
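        /*
         * Round the total up to a cache line and keep a little slack so
         * the base address can itself be aligned to a cache line boundary
         * by uvm_pgflcache_resume().
         */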
       ucpu->pgflcachememsz =
           (roundup2(sz * VM_NFREELIST, coherency_unit) + coherency_unit - 1);
       ucpu->pgflcachemem = kmem_zalloc(ucpu->pgflcachememsz, KM_SLEEP);
}

/*
* uvm_pgflcache_fini_cpu: dump all cached pages back to global free list
* and shut down caching on the CPU.  Called on each CPU in the system via
* xcall.
*/

static void
uvm_pgflcache_fini_cpu(void *arg1 __unused, void *arg2 __unused)
{
       struct uvm_cpu *ucpu;
       int fl, color, s;

       ucpu = curcpu()->ci_data.cpu_uvm;
       for (fl = 0; fl < VM_NFREELIST; fl++) {
               s = splvm();
               for (color = 0; color < uvmexp.ncolors; color++) {
                       uvm_pgflcache_spill(ucpu, fl, color);
               }
               ucpu->pgflcache[fl] = NULL;
               splx(s);
       }
}

/*
* uvm_pgflcache_pause: pause operation of the caches
*/

void
uvm_pgflcache_pause(void)
{
       uint64_t where;

       /* First one in starts draining.  Everyone else waits. */
       mutex_enter(&uvm_pgflcache_lock);
       if (uvm_pgflcache_sem++ == 0) {
               where = xc_broadcast(XC_HIGHPRI, uvm_pgflcache_fini_cpu,
                   (void *)1, NULL);
               xc_wait(where);
       }
       mutex_exit(&uvm_pgflcache_lock);
}
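
/*
* uvm_pgflcache_pause() and uvm_pgflcache_resume() nest, so a caller that
* needs a stable view of the global freelists (e.g. uvm_pglistalloc())
* can simply bracket its work; a sketch:
*
*      uvm_pgflcache_pause();
*      ... walk or rearrange the global freelists ...
*      uvm_pgflcache_resume();
*/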

/*
* uvm_pgflcache_resume: resume operation of the caches
*/

void
uvm_pgflcache_resume(void)
{
       CPU_INFO_ITERATOR cii;
       struct cpu_info *ci;
       struct uvm_cpu *ucpu;
       uintptr_t addr;
       size_t sz;
       int fl;

       /* Last guy out takes care of business. */
       mutex_enter(&uvm_pgflcache_lock);
       KASSERT(uvm_pgflcache_sem > 0);
       if (uvm_pgflcache_sem-- > 1) {
               mutex_exit(&uvm_pgflcache_lock);
               return;
       }

       /*
         * Make sure dependent data structure updates are remotely visible.
        * Essentially this functions as a global memory barrier.
        */
       xc_barrier(XC_HIGHPRI);

       /*
        * Then set all of the pointers in place on each CPU.  As soon as
        * each pointer is set, caching is operational in that dimension.
        */
       sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
       for (CPU_INFO_FOREACH(cii, ci)) {
               ucpu = ci->ci_data.cpu_uvm;
               addr = roundup2((uintptr_t)ucpu->pgflcachemem, coherency_unit);
               for (fl = 0; fl < VM_NFREELIST; fl++) {
                       ucpu->pgflcache[fl] = (struct pgflcache *)addr;
                       addr += sz;
               }
       }
       mutex_exit(&uvm_pgflcache_lock);
}

/*
* uvm_pgflcache_start: start operation of the cache.
*
* => called once only, when init(8) is about to be started
*/

void
uvm_pgflcache_start(void)
{
       CPU_INFO_ITERATOR cii;
       struct cpu_info *ci;

       KASSERT(uvm_pgflcache_sem > 0);

       /*
        * There's not much point doing this if every CPU has its own
        * bucket (and that includes the uniprocessor case).
        */
       if (ncpu == uvm.bucketcount) {
               return;
       }

       /* Create data structures for each CPU. */
       for (CPU_INFO_FOREACH(cii, ci)) {
               uvm_pgflcache_init_cpu(ci);
       }

       /* Kick it into action. */
       uvm_pgflcache_resume();
}

/*
* uvm_pgflcache_init: set up data structures for the free page cache.
*/

void
uvm_pgflcache_init(void)
{

       uvm_pgflcache_sem = 1;
       mutex_init(&uvm_pgflcache_lock, MUTEX_DEFAULT, IPL_NONE);
}

#else   /* MULTIPROCESSOR */

struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{

       return NULL;
}

bool
uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{

       return false;
}

void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{

}

void
uvm_pgflcache_pause(void)
{

}

void
uvm_pgflcache_resume(void)
{

}

void
uvm_pgflcache_start(void)
{

}

void
uvm_pgflcache_init(void)
{

}

#endif  /* MULTIPROCESSOR */