/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_pgflcache.c: page freelist cache.
*
* This implements a tiny per-CPU cache of pages that sits between the main
* page allocator and the freelists. By allocating and freeing pages in
* batch, it reduces freelist contention by an order of magnitude.
*
* The cache can be paused & resumed at runtime so that UVM_HOTPLUG,
* uvm_pglistalloc() and uvm_page_redim() can have a consistent view of the
* world. On system with one CPU per physical package (e.g. a uniprocessor)
* the cache is not enabled.
*/
/* There is no point doing any of this on a uniprocessor. */
#ifdef MULTIPROCESSOR
/*
* MAXPGS - maximum pages per color, per bucket.
* FILLPGS - number of pages to allocate at once, per color, per bucket.
*
* Why the chosen values:
*
* (1) In 2019, an average Intel system has 4kB pages and 8x L2 cache
* colors. We make the assumption that most of the time allocation activity
* will be centered around one UVM freelist, so most of the time there will
* be no more than 224kB worth of cached pages per-CPU. That's tiny, but
* enough to hugely reduce contention on the freelist locks, and give us a
* small pool of pages which if we're very lucky may have some L1/L2 cache
* locality, and do so without subtracting too much from the L2/L3 cache
* benefits of having per-package free lists in the page allocator.
*
* (2) With the chosen values on _LP64, the data structure for each color
* takes up a single cache line (64 bytes) giving this very low overhead
* even in the "miss" case.
*
* (3) We don't want to cause too much pressure by hiding away memory that
* could otherwise be put to good use.
*/
#define MAXPGS 7
#define FILLPGS 6
static kmutex_t uvm_pgflcache_lock;
static int uvm_pgflcache_sem;
/*
* uvm_pgflcache_fill: fill specified freelist/color from global list
*
* => must be called at IPL_VM
* => must be called with given bucket lock held
* => must only fill from the correct bucket for this CPU
*/
void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{
struct pgflbucket *pgb;
struct pgflcache *pc;
struct pccolor *pcc;
struct pgflist *head;
struct vm_page *pg;
int count;
/* If caching is off, then bail out. */
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return;
}
/* Fill only to the limit. */
pcc = &pc->color[c];
pgb = uvm.page_free[fl].pgfl_buckets[b];
head = &pgb->pgb_colors[c];
if (pcc->count >= FILLPGS) {
return;
}
/* Pull pages from the bucket until it's empty, or we are full. */
count = pcc->count;
pg = LIST_FIRST(head);
while (__predict_true(pg != NULL && count < FILLPGS)) {
KASSERT(pg->flags & PG_FREE);
KASSERT(uvm_page_get_bucket(pg) == b);
pcc->pages[count++] = pg;
pg = LIST_NEXT(pg, pageq.list);
}
/* Violate LIST abstraction to remove all pages at once. */
head->lh_first = pg;
if (__predict_true(pg != NULL)) {
pg->pageq.list.le_prev = &head->lh_first;
}
pgb->pgb_nfree -= (count - pcc->count);
CPU_COUNT(CPU_COUNT_FREEPAGES, -(count - pcc->count));
pcc->count = count;
}
/*
* uvm_pgflcache_spill: spill specified freelist/color to global list
*
* => must be called at IPL_VM
* => mark __noinline so we don't pull it into uvm_pgflcache_free()
*/
static void __noinline
uvm_pgflcache_spill(struct uvm_cpu *ucpu, int fl, int c)
{
struct pgflbucket *pgb;
struct pgfreelist *pgfl;
struct pgflcache *pc;
struct pccolor *pcc;
struct pgflist *head;
kmutex_t *lock;
int b, adj;
pc = ucpu->pgflcache[fl];
pcc = &pc->color[c];
pgfl = &uvm.page_free[fl];
b = ucpu->pgflbucket;
pgb = pgfl->pgfl_buckets[b];
head = &pgb->pgb_colors[c];
lock = &uvm_freelist_locks[b].lock;
/*
* uvm_pgflcache_alloc: try to allocate a cached page.
*
* => must be called at IPL_VM
* => allocate only from the given freelist and given page color
*/
struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{
struct pgflcache *pc;
struct pccolor *pcc;
struct vm_page *pg;
/* If caching is off, then bail out. */
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return NULL;
}
/* Very simple: if we have a page then return it. */
pcc = &pc->color[c];
if (__predict_false(pcc->count == 0)) {
return NULL;
}
pg = pcc->pages[--(pcc->count)];
KASSERT(pg != NULL);
KASSERT(pg->flags == PG_FREE);
KASSERT(uvm_page_get_freelist(pg) == fl);
KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
return pg;
}
/*
* uvm_pgflcache_free: cache a page, if possible.
*
* => must be called at IPL_VM
* => must only send pages for the correct bucket for this CPU
*/
/* If caching is off, then bail out. */
fl = uvm_page_get_freelist(pg);
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return false;
}
/* If the array is full spill it first, then add page to array. */
c = VM_PGCOLOR(pg);
pcc = &pc->color[c];
KASSERT((pg->flags & PG_FREE) == 0);
if (__predict_false(pcc->count == MAXPGS)) {
uvm_pgflcache_spill(ucpu, fl, c);
}
pg->flags = PG_FREE;
pcc->pages[pcc->count] = pg;
pcc->count++;
return true;
}
/*
* uvm_pgflcache_init: allocate and initialize per-CPU data structures for
* the free page cache. Don't set anything in motion - that's taken care
* of by uvm_pgflcache_resume().
*/
/*
* uvm_pgflcache_fini_cpu: dump all cached pages back to global free list
* and shut down caching on the CPU. Called on each CPU in the system via
* xcall.
*/
/* Last guy out takes care of business. */
mutex_enter(&uvm_pgflcache_lock);
KASSERT(uvm_pgflcache_sem > 0);
if (uvm_pgflcache_sem-- > 1) {
mutex_exit(&uvm_pgflcache_lock);
return;
}
/*
* Make sure dependant data structure updates are remotely visible.
* Essentially this functions as a global memory barrier.
*/
xc_barrier(XC_HIGHPRI);
/*
* Then set all of the pointers in place on each CPU. As soon as
* each pointer is set, caching is operational in that dimension.
*/
sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
for (CPU_INFO_FOREACH(cii, ci)) {
ucpu = ci->ci_data.cpu_uvm;
addr = roundup2((uintptr_t)ucpu->pgflcachemem, coherency_unit);
for (fl = 0; fl < VM_NFREELIST; fl++) {
ucpu->pgflcache[fl] = (struct pgflcache *)addr;
addr += sz;
}
}
mutex_exit(&uvm_pgflcache_lock);
}
/*
* uvm_pgflcache_start: start operation of the cache.
*
* => called once only, when init(8) is about to be started
*/
/*
* There's not much point doing this if every CPU has its own
* bucket (and that includes the uniprocessor case).
*/
if (ncpu == uvm.bucketcount) {
return;
}
/* Create data structures for each CPU. */
for (CPU_INFO_FOREACH(cii, ci)) {
uvm_pgflcache_init_cpu(ci);
}
/* Kick it into action. */
uvm_pgflcache_resume();
}
/*
* uvm_pgflcache_init: set up data structures for the free page cache.
*/