/*
* Copyright (c) 1997, 1999, 2000, 2002, 2007, 2008, 2010, 2014, 2015, 2018,
* 2020, 2021 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
* Simulation Facility, NASA Ames Research Center; by Andrew Doran, and by
* Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Pool resource management utility.
*
* Memory is allocated in pages which are split into pieces according to
* the pool item size. Each page is kept on one of three lists in the
* pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
* for empty, full and partially-full pages respectively. The individual
* pool items are on a linked list headed by `ph_itemlist' in each page
* header. The memory for building the page list is either taken from
* the allocated pages themselves (for small pool items) or taken from
* an internal pool of page headers (`phpool').
*/
/* List of all pools. Non static as needed by 'vmstat -m' */
TAILQ_HEAD(, pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head);
/* Private pool for page header structures */
#define PHPOOL_MAX 8
static struct pool phpool[PHPOOL_MAX];
/*
 * Item capacity associated with page-header pool `idx': BITMAP_MIN_SIZE
 * for index 0, otherwise BITMAP_SIZE multiplied by 2^idx.  pool_init()
 * walks idx upward until this value is at least pr_itemsperpage and
 * panics if that requires idx >= PHPOOL_MAX.
 */
#define PHPOOL_FREELIST_NELEM(idx) \
(((idx) == 0) ? BITMAP_MIN_SIZE : BITMAP_SIZE * (1 << (idx)))
/*
* Pool backend allocators.
*
* Each pool has a backend allocator that handles allocation, deallocation,
* and any additional draining that might be needed.
*
* We provide two standard allocators:
*
* pool_allocator_kmem - the default when no allocator is specified
*
* pool_allocator_nointr - used for pools that will not be accessed
* in interrupt context.
*/
void *pool_page_alloc(struct pool *, int);
void pool_page_free(struct pool *, void *);
static void *pool_page_alloc_meta(struct pool *, int);
static void pool_page_free_meta(struct pool *, void *);
/*
 * Per-item bookkeeping record: a list linkage and, in POOL_CHECK_MAGIC
 * builds only, a leading magic word.  The file header describes pool items
 * as sitting on a linked list headed by `ph_itemlist' in each page header;
 * pi_list is presumably that linkage -- confirm against the list users.
 */
struct pool_item {
#ifdef POOL_CHECK_MAGIC
/* Present only when POOL_CHECK_MAGIC is defined. */
u_int pi_magic;
#endif
/* NOTE(review): presumably the value stored in pi_magic -- confirm. */
#define PI_MAGIC 0xdeaddeadU
/* Other entries use only this list entry */
LIST_ENTRY(pool_item) pi_list;
};
/*
* Pool cache management.
*
* Pool caches provide a way for constructed objects to be cached by the
* pool subsystem. This can lead to performance improvements by avoiding
* needless object construction/destruction; it is deferred until absolutely
* necessary.
*
* Caches are grouped into cache groups. Each cache group references up
* to PCG_NUMOBJECTS constructed objects. When a cache allocates an
* object from the pool, it calls the object's constructor and places it
* into a cache group. When a cache group frees an object back to the
* pool, it first calls the object's destructor. This allows the object
* to persist in constructed form while freed to the cache.
*
* The pool references each cache, so that when a pool is drained by the
* pagedaemon, it can drain each individual cache as well. Each time a
* cache is drained, the most idle cache group is freed to the pool in
* its entirety.
*
* Pool caches are laid on top of pools. By layering them, we can avoid
* the complexity of cache management for pools which would not benefit
* from it.
*/
static struct pool pcg_normal_pool;
static struct pool pcg_large_pool;
static struct pool cache_pool;
static struct pool cache_cpu_pool;
static int pool_catchup(struct pool *);
static void pool_prime_page(struct pool *, void *,
struct pool_item_header *);
static void pool_update_curpage(struct pool *);
static int pool_grow(struct pool *, int);
static void *pool_allocator_alloc(struct pool *, int);
static void pool_allocator_free(struct pool *, void *);
/*
 * Ordering function for page headers, keyed on ph_page.
 *
 * The sense is intentionally reversed: the header whose ph_page is lower
 * compares as the greater one.  Returns 1 when a's page lies below b's,
 * -1 when it lies above, and 0 when both are the same.  pr_find_pagehead
 * depends on this inverted ordering.
 */
static inline int
phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
{
	if (a->ph_page == b->ph_page)
		return 0;

	return (a->ph_page < b->ph_page) ? 1 : -1;
}
/*
* Initialize private page header pool and cache magazine pool if we
* haven't done so yet.
*/
for (idx = 0; idx < PHPOOL_MAX; idx++) {
static char phpool_names[PHPOOL_MAX][6+1+6+1];
int nelem;
size_t sz;
/*
 * Decide whether the page header should be stored inside the page itself.
 *
 * Returns true when PR_PHINPAGE is already set, false when PR_NOTOUCH or
 * PR_NOALIGN is set, and otherwise applies two size heuristics based on
 * the allocator page size and the item size.
 */
static inline bool
pool_init_is_phinpage(const struct pool *pp)
{
	size_t pgsz;

	/* An explicit request for an in-page header always wins. */
	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
		return true;

	/* With either of these flags the header is kept off-page. */
	if ((pp->pr_roflags & (PR_NOTOUCH | PR_NOALIGN)) != 0)
		return false;

	pgsz = pp->pr_alloc->pa_pagesz;

	/*
	 * In-page if either:
	 *  - the item is small: below 1/16 of a page and below eight page
	 *    headers, so that a header never costs us a rather big item; or
	 *  - reserving PHSIZE bytes does not reduce the number of items
	 *    that fit in the page.
	 */
	return pp->pr_size < MIN(pgsz / 16, PHSIZE * 8) ||
	    pgsz / pp->pr_size == (pgsz - PHSIZE) / pp->pr_size;
}
/*
 * Decide whether free items are tracked with a bitmap (true) rather than
 * a linked list (false).
 */
static inline bool
pool_init_is_usebmap(const struct pool *pp)
{
	size_t hdr_bmap_bytes;

	/*
	 * PR_NOTOUCH pools and pools with an off-page header always use
	 * a bitmap.
	 */
	if ((pp->pr_roflags & PR_NOTOUCH) != 0 ||
	    (pp->pr_roflags & PR_PHINPAGE) == 0)
		return true;

	/*
	 * On-page header: use a bitmap only if the space between the
	 * start of ph_bitmap and the aligned end of the header already
	 * holds one bit for every item in the page.
	 */
	hdr_bmap_bytes = roundup(PHSIZE, pp->pr_align) -
	    offsetof(struct pool_item_header, ph_bitmap[0]);
	KASSERT(hdr_bmap_bytes % sizeof(pool_item_bitmap_t) == 0);

	return pp->pr_itemsperpage <= hdr_bmap_bytes * CHAR_BIT;
}
/*
* Initialize the given pool resource structure.
*
* We export this routine to allow other kernel parts to declare
* static pools that must be initialized before kmem(9) is available.
*/
void
pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
const char *wchan, struct pool_allocator *palloc, int ipl)
{
struct pool *pp1;
size_t prsize;
int itemspace, slack;
/* XXX ioff will be removed. */
KASSERT(ioff == 0);
#ifdef DEBUG
if (__predict_true(!cold))
mutex_enter(&pool_head_lock);
/*
* Check that the pool hasn't already been initialised and
* added to the list of all pools.
*/
TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
if (pp == pp1)
panic("%s: [%s] already initialised", __func__,
wchan);
}
if (__predict_true(!cold))
mutex_exit(&pool_head_lock);
#endif
if (palloc == NULL) {
if (size > PAGE_SIZE) {
int bigidx = pool_bigidx(size);
/*
* PR_PSERIALIZE implies PR_NOTOUCH; freed objects must remain
* valid until the backing page is returned to the system.
*/
if (flags & PR_PSERIALIZE) {
flags |= PR_NOTOUCH;
}
/*
* Decide whether to put the page header off-page to avoid wasting too
* large a part of the page or too big an item. Off-page page headers
* go on a hash table, so we can match a returned item with its header
* based on the page address.
*/
if (pool_init_is_phinpage(pp)) {
/* Use the beginning of the page for the page header */
itemspace = palloc->pa_pagesz - roundup(PHSIZE, align);
pp->pr_itemoffset = roundup(PHSIZE, align);
pp->pr_roflags |= PR_PHINPAGE;
} else {
/* The page header will be taken from our page header pool */
itemspace = palloc->pa_pagesz;
pp->pr_itemoffset = 0;
SPLAY_INIT(&pp->pr_phtree);
}
/*
* Decide whether to use a bitmap or a linked list to manage freed
* items.
*/
if (pool_init_is_usebmap(pp)) {
pp->pr_roflags |= PR_USEBMAP;
}
/*
* If we're off-page, then we're using a bitmap; choose the appropriate
* pool to allocate page headers, whose size varies depending on the
* bitmap. If we're on-page, nothing to do.
*/
if (!(pp->pr_roflags & PR_PHINPAGE)) {
int idx;
KASSERT(pp->pr_roflags & PR_USEBMAP);
for (idx = 0; pp->pr_itemsperpage > PHPOOL_FREELIST_NELEM(idx);
idx++) {
/* nothing */
}
if (idx >= PHPOOL_MAX) {
/*
* If you see this panic, consider tweaking
* PHPOOL_MAX and PHPOOL_FREELIST_NELEM.
*/
panic("%s: [%s] too large itemsperpage(%d) for "
"PR_USEBMAP", __func__,
pp->pr_wchan, pp->pr_itemsperpage);
}
pp->pr_phpool = &phpool[idx];
} else {
pp->pr_phpool = NULL;
}
/*
* Use the slack between the chunks and the page header
* for "cache coloring".
*/
slack = itemspace - pp->pr_itemsperpage * pp->pr_size;
pp->pr_maxcolor = rounddown(slack, align);
pp->pr_curcolor = 0;
/* Insert into the list of all pools. */
if (!cold)
mutex_enter(&pool_head_lock);
TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
if (strcmp(pp1->pr_wchan, pp->pr_wchan) > 0)
break;
}
if (pp1 == NULL)
TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist);
else
TAILQ_INSERT_BEFORE(pp1, pp, pr_poollist);
if (!cold)
mutex_exit(&pool_head_lock);
/* Insert this into the list of pools using this allocator. */
if (!cold)
mutex_enter(&palloc->pa_lock);
TAILQ_INSERT_TAIL(&palloc->pa_list, pp, pr_alloc_list);
if (!cold)
mutex_exit(&palloc->pa_lock);
}
/*
* De-commission a pool resource.
*/
void
pool_destroy(struct pool *pp)
{
struct pool_pagelist pq;
struct pool_item_header *ph;
pool_quarantine_flush(pp);
/*
 * Remove from global pool list.  We sleep on pool_busy until nobody
 * holds a reference (pr_refcnt), and make sure the drain cursor does
 * not keep pointing at the pool being removed.
 */
mutex_enter(&pool_head_lock);
while (pp->pr_refcnt != 0)
cv_wait(&pool_busy, &pool_head_lock);
TAILQ_REMOVE(&pool_head, pp, pr_poollist);
if (drainpp == pp)
drainpp = NULL;
mutex_exit(&pool_head_lock);
/* Remove this pool from its allocator's list of pools. */
mutex_enter(&pp->pr_alloc->pa_lock);
TAILQ_REMOVE(&pp->pr_alloc->pa_list, pp, pr_alloc_list);
mutex_exit(&pp->pr_alloc->pa_lock);
/* Drop our allocator reference; the last user destroys its lock. */
mutex_enter(&pool_allocator_lock);
if (--pp->pr_alloc->pa_refcnt == 0)
mutex_destroy(&pp->pr_alloc->pa_lock);
mutex_exit(&pool_allocator_lock);
/* The pool must be idle: no cache attached, nothing handed out. */
mutex_enter(&pp->pr_lock);
KASSERT(pp->pr_cache == NULL);
KASSERTMSG((pp->pr_nout == 0),
"%s: [%s] pool busy: still out: %u", __func__, pp->pr_wchan,
pp->pr_nout);
KASSERT(LIST_EMPTY(&pp->pr_fullpages));
KASSERT(LIST_EMPTY(&pp->pr_partpages));
/* Remove all pages; pr_rmpage() is handed the local list `pq'. */
LIST_INIT(&pq);
while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
pr_rmpage(pp, ph, &pq);
/*
 * NOTE(review): `fn' and `arg' are not declared anywhere in this
 * function, so the drain-hook lines below look like they belong to a
 * separate drain-hook setter.  Also, in the code visible here pr_lock
 * is still held on return and `pq' is never released -- confirm
 * against the complete source.
 */
/* XXX no locking -- must be used just after pool_init() */
KASSERTMSG((pp->pr_drain_hook == NULL),
"%s: [%s] already set", __func__, pp->pr_wchan);
pp->pr_drain_hook = fn;
pp->pr_drain_hook_arg = arg;
}
static struct pool_item_header *
pool_alloc_item_header(struct pool *pp, void *storage, int flags)
{
struct pool_item_header *ph;
/*
* Grab an item from the pool.
*/
void *
pool_get(struct pool *pp, int flags)
{
struct pool_item_header *ph;
void *v;
KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK));
KASSERTMSG((pp->pr_itemsperpage != 0),
"%s: [%s] pr_itemsperpage is zero, "
"pool not initialized?", __func__, pp->pr_wchan);
KASSERTMSG((!(cpu_intr_p() || cpu_softintr_p())
|| pp->pr_ipl != IPL_NONE || cold || panicstr != NULL),
"%s: [%s] is IPL_NONE, but called from interrupt context",
__func__, pp->pr_wchan);
if (flags & PR_WAITOK) {
ASSERT_SLEEPABLE();
}
if (flags & PR_NOWAIT) {
if (fault_inject())
return NULL;
}
mutex_enter(&pp->pr_lock);
startover:
/*
* Check to see if we've reached the hard limit. If we have,
* and we can wait, then wait until an item has been returned to
* the pool.
*/
KASSERTMSG((pp->pr_nout <= pp->pr_hardlimit),
"%s: %s: crossed hard limit", __func__, pp->pr_wchan);
if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) {
if (pp->pr_drain_hook != NULL) {
/*
* Since the drain hook is going to free things
* back to the pool, unlock, call the hook, re-lock,
* and check the hardlimit condition again.
*/
mutex_exit(&pp->pr_lock);
(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
mutex_enter(&pp->pr_lock);
if (pp->pr_nout < pp->pr_hardlimit)
goto startover;
}
if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
/*
* XXX: A warning isn't logged in this case. Should
* it be?
*/
pp->pr_flags |= PR_WANTED;
do {
cv_wait(&pp->pr_cv, &pp->pr_lock);
} while (pp->pr_flags & PR_WANTED);
goto startover;
}
/*
* Log a message that the hard limit has been hit.
*/
if (pp->pr_hardlimit_warning != NULL &&
ratecheck(&pp->pr_hardlimit_warning_last,
&pp->pr_hardlimit_ratecap))
log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);
/*
* The convention we use is that if `curpage' is not NULL, then
* it points at a non-empty bucket. In particular, `curpage'
* never points at a page header which has PR_PHINPAGE set and
* has no items in its bucket.
*/
if ((ph = pp->pr_curpage) == NULL) {
int error;
/*
* Call the back-end page allocator for more memory.
* Release the pool lock, as the back-end page allocator
* may block.
*/
error = pool_grow(pp, flags);
if (error != 0) {
/*
* pool_grow aborts when another thread
* is allocating a new page. Retry if it
* waited for it.
*/
if (error == ERESTART)
goto startover;
/*
* We were unable to allocate a page or item
* header, but we released the lock during
* allocation, so perhaps items were freed
* back to the pool. Check for this case.
*/
if (pp->pr_curpage != NULL)
goto startover;
/* Start the allocation process over. */
goto startover;
}
if (pp->pr_roflags & PR_USEBMAP) {
KASSERTMSG((ph->ph_nmissing < pp->pr_itemsperpage),
"%s: [%s] pool page empty", __func__, pp->pr_wchan);
v = pr_item_bitmap_get(pp, ph);
} else {
v = pr_item_linkedlist_get(pp, ph);
}
pp->pr_nitems--;
pp->pr_nout++;
if (ph->ph_nmissing == 0) {
KASSERT(pp->pr_nidle > 0);
pp->pr_nidle--;
/*
* This page was previously empty. Move it to the list of
* partially-full pages. This page is already curpage.
*/
LIST_REMOVE(ph, ph_pagelist);
LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
}
ph->ph_nmissing++;
if (ph->ph_nmissing == pp->pr_itemsperpage) {
KASSERTMSG(((pp->pr_roflags & PR_USEBMAP) ||
LIST_EMPTY(&ph->ph_itemlist)),
"%s: [%s] nmissing (%u) inconsistent", __func__,
pp->pr_wchan, ph->ph_nmissing);
/*
* This page is now full. Move it to the full list
* and select a new current page.
*/
LIST_REMOVE(ph, ph_pagelist);
LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist);
pool_update_curpage(pp);
}
pp->pr_nget++;
/*
* If we have a low water mark and we are now below that low
* water mark, add more items to the pool.
*/
if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
/*
* XXX: Should we log a warning? Should we set up a timeout
* to try again in a second or so? The latter could break
* a caller's assumptions about interrupt protection, etc.
*/
}
/*
* Internal version of pool_put(). Pool is already locked/entered.
*/
static void
pool_do_put(struct pool *pp, void *v, struct pool_pagelist *pq)
{
struct pool_item_header *ph;
/*
 * NOTE(review): in the code visible here `ph' is never assigned before
 * it is used and `v' is not referenced; the lookup of the page header
 * owning `v' and the per-item bookkeeping are presumably missing --
 * confirm against the complete source.
 */
/* Cancel "pool empty" condition if it exists */
if (pp->pr_curpage == NULL)
pp->pr_curpage = ph;
/* Wake every thread sleeping on pr_cv for an item to come back. */
if (pp->pr_flags & PR_WANTED) {
pp->pr_flags &= ~PR_WANTED;
cv_broadcast(&pp->pr_cv);
}
/*
 * If this page is now empty, do one of two things:
 *
 * (1) If we have more pages than the page high water mark,
 * free the page back to the system. ONLY CONSIDER
 * FREEING BACK A PAGE IF WE HAVE MORE THAN OUR MINIMUM PAGE
 * CLAIM.
 *
 * (2) Otherwise, move the page to the empty page list.
 *
 * Either way, select a new current page (so we use a partially-full
 * page if one is available).
 */
if (ph->ph_nmissing == 0) {
pp->pr_nidle++;
/*
 * Release the page only if the pool stays at or above its item
 * and page minimums afterwards and it currently exceeds either
 * the page maximum or the item maximum.
 */
if (pp->pr_nitems - pp->pr_itemsperpage >= pp->pr_minitems &&
pp->pr_npages > pp->pr_minpages &&
(pp->pr_npages > pp->pr_maxpages ||
pp->pr_nitems > pp->pr_maxitems)) {
pr_rmpage(pp, ph, pq);
} else {
LIST_REMOVE(ph, ph_pagelist);
LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
/*
 * Update the timestamp on the page. A page must
 * be idle for some period of time before it can
 * be reclaimed by the pagedaemon. This minimizes
 * ping-pong'ing for memory.
 *
 * note for 64-bit time_t: truncating to 32-bit is not
 * a problem for our usage.
 */
ph->ph_time = time_uptime;
}
pool_update_curpage(pp);
}
/*
 * If the page was previously completely full, move it to the
 * partially-full list and make it the current page. The next
 * allocation will get the item from this page, instead of
 * further fragmenting the pool.
 */
else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
LIST_REMOVE(ph, ph_pagelist);
LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
pp->pr_curpage = ph;
}
}
void
pool_put(struct pool *pp, void *v)
{
struct pool_pagelist pq;
/*
* pool_grow: grow a pool by a page.
*
* => called with pool locked.
* => unlock and relock the pool.
* => return with pool locked.
*/
static int
pool_grow(struct pool *pp, int flags)
{
	struct pool_item_header *ph;
	char *storage;

	/*
	 * Return values:
	 *   0            a new page was primed and added to the pool;
	 *   ERESTART     another grow was in progress; the caller should
	 *                re-examine the pool and retry;
	 *   EWOULDBLOCK  another grow is in progress and this caller may
	 *                not wait for it;
	 *   ENOMEM       the page or its page header could not be obtained.
	 */

	/*
	 * If there's a pool_grow in progress, wait for it to complete
	 * and try again from the top.
	 */
	if (pp->pr_flags & PR_GROWING) {
		if (flags & PR_WAITOK) {
			do {
				cv_wait(&pp->pr_cv, &pp->pr_lock);
			} while (pp->pr_flags & PR_GROWING);
			return ERESTART;
		} else {
			if (pp->pr_flags & PR_GROWINGNOWAIT) {
				/*
				 * This needs an unlock/relock dance so
				 * that the other caller has a chance to
				 * run and actually do the thing.  Note
				 * that this is effectively a busy-wait.
				 */
				mutex_exit(&pp->pr_lock);
				mutex_enter(&pp->pr_lock);
				return ERESTART;
			}
			return EWOULDBLOCK;
		}
	}

	/*
	 * Claim the grow.  A sleeping allocation runs with the pool lock
	 * dropped; a non-sleeping one keeps the lock and advertises that
	 * fact through PR_GROWINGNOWAIT.
	 */
	pp->pr_flags |= PR_GROWING;
	if (flags & PR_WAITOK)
		mutex_exit(&pp->pr_lock);
	else
		pp->pr_flags |= PR_GROWINGNOWAIT;

	storage = pool_allocator_alloc(pp, flags);
	if (__predict_false(storage == NULL))
		goto out;

	/*
	 * Obtain the page header describing the new page before priming
	 * it; `ph' must never reach pool_prime_page() uninitialized.  If
	 * no header can be had, hand the page back so it is not leaked.
	 */
	ph = pool_alloc_item_header(pp, storage, flags);
	if (__predict_false(ph == NULL)) {
		pool_allocator_free(pp, storage);
		goto out;
	}

	if (flags & PR_WAITOK)
		mutex_enter(&pp->pr_lock);
	pool_prime_page(pp, storage, ph);
	pp->pr_npagealloc++;
	KASSERT(pp->pr_flags & PR_GROWING);
	pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT);

	/*
	 * If anyone was waiting for pool_grow, notify them that we
	 * may have just done it.
	 */
	cv_broadcast(&pp->pr_cv);
	return 0;

out:
	/* Failure: restore the "called locked, return locked" contract. */
	if (flags & PR_WAITOK)
		mutex_enter(&pp->pr_lock);
	KASSERT(pp->pr_flags & PR_GROWING);
	pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT);
	return ENOMEM;
}
/*
* Add a page worth of items to the pool.
*
* Note, we must be called with the pool descriptor LOCKED.
*/
static void
pool_prime_page(struct pool *pp, void *storage, struct pool_item_header *ph)
{
	/*
	 * Account for a freshly added page.  `storage' is not referenced
	 * by the statements in this body.
	 */

	/* A depleted pool starts allocating from the new page. */
	if (pp->pr_curpage == NULL) {
		pp->pr_curpage = ph;
	}

	/* Count the page and keep the page-count high-water mark current. */
	pp->pr_npages++;
	if (pp->pr_npages > pp->pr_hiwat) {
		pp->pr_hiwat = pp->pr_npages;
	}
}
/*
* Used by pool_get() when nitems drops below the low water mark. This
* is used to catch up pr_nitems with the low water mark.
*
* Note 1, we never wait for memory here, we let the caller decide what to do.
*
* Note 2, we must be called with the pool already locked, and we return
* with it locked.
*/
static int
pool_catchup(struct pool *pp)
{
	int rv = 0;

	/*
	 * Keep adding pages, never sleeping, for as long as the pool is
	 * below its low water mark.  ERESTART from pool_grow() just means
	 * "look again"; any other failure ends the attempt.  The status of
	 * the last pool_grow() call (0 if none was made) is returned.
	 */
	while (POOL_NEEDS_CATCHUP(pp)) {
		rv = pool_grow(pp, PR_NOWAIT);
		if (rv != 0 && rv != ERESTART)
			break;
	}

	return rv;
}
static void
pool_update_curpage(struct pool *pp)
{
/*
 * NOTE(review): the body visible here does not select a new current
 * page; it tops the pool up to its low water mark and then releases
 * pr_lock without having taken it.  That does not match the callers in
 * this file, which go on using the pool after the call -- confirm
 * against the complete source.
 */
/* Make sure we're caught up with the newly-set low water mark. */
if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
/*
 * XXX: Should we log a warning? Should we set up a timeout
 * to try again in a second or so? The latter could break
 * a caller's assumptions about interrupt protection, etc.
 */
}
mutex_exit(&pp->pr_lock);
}
/*
 * Set the pool's item high water mark: store `n' in pr_maxitems while
 * holding pr_lock.
 */
void
pool_sethiwat(struct pool *pp, int n)
{
mutex_enter(&pp->pr_lock);
pp->pr_maxitems = n;
mutex_exit(&pp->pr_lock);
}
void
pool_sethardlimit(struct pool *pp, int n, const char *warnmess, int ratecap)
{
/*
* Release all complete pages that have not been used recently.
*
* Must not be called from interrupt context.
*/
int
pool_reclaim(struct pool *pp)
{
struct pool_item_header *ph, *phnext;
struct pool_pagelist pq;
struct pool_cache *pc;
uint32_t curtime;
bool klock;
int rv;
if (pp->pr_drain_hook != NULL) {
/*
* The drain hook must be called with the pool unlocked.
*/
(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, PR_NOWAIT);
}
/*
* XXXSMP Because we do not want to cause non-MPSAFE code
* to block.
*/
if (pp->pr_ipl == IPL_SOFTNET || pp->pr_ipl == IPL_SOFTCLOCK ||
pp->pr_ipl == IPL_SOFTSERIAL) {
KERNEL_LOCK(1, NULL);
klock = true;
} else
klock = false;
/* Reclaim items from the pool's cache (if any). */
if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL)
pool_cache_invalidate(pc);
if (mutex_tryenter(&pp->pr_lock) == 0) {
if (klock) {
KERNEL_UNLOCK_ONE(NULL);
}
return 0;
}
KASSERT(ph->ph_nmissing == 0);
if (curtime - ph->ph_time < pool_inactive_time)
continue;
/*
* If freeing this page would put us below the minimum free items
* or the minimum pages, stop now.
*/
if (pp->pr_nitems - pp->pr_itemsperpage < pp->pr_minitems ||
pp->pr_npages - 1 < pp->pr_minpages)
break;
/*
* Drain pools, one at a time. The drained pool is returned within ppp.
*
* Note, must never be called from interrupt context.
*/
bool
pool_drain(struct pool **ppp)
{
	struct pool *victim = NULL;
	bool freed;

	KASSERT(!TAILQ_EMPTY(&pool_head));

	/*
	 * Advance the global drain cursor, wrapping to the head of the
	 * pool list when it runs off the end, until it lands on a pool
	 * that owns at least one page.  Completely idle pools are skipped;
	 * we depend on at least one pool in the system being active.
	 */
	mutex_enter(&pool_head_lock);
	for (;;) {
		if (drainpp == NULL)
			drainpp = TAILQ_FIRST(&pool_head);
		if (drainpp != NULL) {
			victim = drainpp;
			drainpp = TAILQ_NEXT(victim, pr_poollist);
		}
		if (victim != NULL && victim->pr_npages != 0)
			break;
	}

	/* Take a reference on the chosen pool while it is being reclaimed. */
	victim->pr_refcnt++;
	mutex_exit(&pool_head_lock);

	/* Drain the cache (if any) and the pool. */
	freed = pool_reclaim(victim);

	/* Drop the reference and wake anyone sleeping on pool_busy. */
	mutex_enter(&pool_head_lock);
	victim->pr_refcnt--;
	cv_broadcast(&pool_busy);
	mutex_exit(&pool_head_lock);

	/* Report which pool was drained, if the caller wants to know. */
	if (ppp != NULL)
		*ppp = victim;

	return freed;
}
/*
* Calculate the total number of pages consumed by pools.
*/
int
pool_totalpages(void)
{
	int npages;

	/* Locking wrapper: the counting is done by the _locked variant. */
	mutex_enter(&pool_head_lock);
	npages = pool_totalpages_locked();
	mutex_exit(&pool_head_lock);

	return npages;
}
int
pool_totalpages_locked(void)
{
struct pool *pp;
uint64_t total = 0;
/*
* pool_cache_bootstrap:
*
* Kernel-private version of pool_cache_init(). The caller
* provides initial storage.
*/
void
pool_cache_bootstrap(pool_cache_t pc, size_t size, u_int align,
u_int align_offset, u_int flags, const char *wchan,
struct pool_allocator *palloc, int ipl,
int (*ctor)(void *, void *, int), void (*dtor)(void *, void *),
void *arg)
{
CPU_INFO_ITERATOR cii;
pool_cache_t pc1;
struct cpu_info *ci;
struct pool *pp;
unsigned int ppflags;
/*
 * NOTE(review): in the code visible here `ppflags' is computed but
 * never consumed, and size/align/align_offset/wchan/palloc/ipl/arg are
 * unused; the initialization of the underlying pool and of the cache
 * fields is presumably missing -- confirm against the complete source.
 */
pp = &pc->pc_pool;
ppflags = flags;
/* Substitute the NO_CTOR / NO_DTOR placeholders for missing callbacks. */
if (ctor == NULL) {
ctor = NO_CTOR;
}
if (dtor == NULL) {
dtor = NO_DTOR;
} else {
/*
 * If we have a destructor, then the pool layer does not
 * need to worry about PR_PSERIALIZE.
 */
ppflags &= ~PR_PSERIALIZE;
}
/* Allocate per-CPU caches. */
memset(pc->pc_cpus, 0, sizeof(pc->pc_cpus));
pc->pc_ncpu = 0;
if (ncpu < 2) {
/* XXX For sparc: boot CPU is not attached yet. */
pool_cache_cpu_init1(curcpu(), pc);
} else {
for (CPU_INFO_FOREACH(cii, ci)) {
pool_cache_cpu_init1(ci, pc);
}
}
/*
 * Add to list of all pools.  The list is kept sorted by pr_wchan:
 * insert before the first entry whose name compares greater.  The
 * lock is skipped while `cold' is set.
 */
if (__predict_true(!cold))
mutex_enter(&pool_head_lock);
TAILQ_FOREACH(pc1, &pool_cache_head, pc_cachelist) {
if (strcmp(pc1->pc_pool.pr_wchan, pc->pc_pool.pr_wchan) > 0)
break;
}
if (pc1 == NULL)
TAILQ_INSERT_TAIL(&pool_cache_head, pc, pc_cachelist);
else
TAILQ_INSERT_BEFORE(pc1, pc, pc_cachelist);
if (__predict_true(!cold))
mutex_exit(&pool_head_lock);
/*
 * Publish the cache to the pool last, with release ordering; it pairs
 * with the atomic_load_consume() of pr_cache in pool_reclaim().
 */
atomic_store_release(&pp->pr_cache, pc);
}
/*
* pool_cache_destroy:
*
* Destroy a pool cache.
*/
void
pool_cache_destroy(pool_cache_t pc)
{
/*
* pool_cache_bootstrap_destroy:
*
* Destroy a pool cache.
*/
void
pool_cache_bootstrap_destroy(pool_cache_t pc)
{
	struct pool *const pool = &pc->pc_pool;
	u_int cpu;

	/*
	 * Unlink the cache from the global list once no references
	 * remain; sleep on pool_busy until pc_refcnt reaches zero.
	 */
	mutex_enter(&pool_head_lock);
	while (pc->pc_refcnt != 0)
		cv_wait(&pool_busy, &pool_head_lock);
	TAILQ_REMOVE(&pool_cache_head, pc, pc_cachelist);
	mutex_exit(&pool_head_lock);

	/* Invalidate the entire cache. */
	pool_cache_invalidate(pc);

	/* Detach the cache from its pool. */
	mutex_enter(&pool->pr_lock);
	atomic_store_relaxed(&pool->pr_cache, NULL);
	mutex_exit(&pool->pr_lock);

	/* Tear down the per-CPU state for every slot in pc_cpus. */
	cpu = 0;
	while (cpu < __arraycount(pc->pc_cpus)) {
		pool_cache_invalidate_cpu(pc, cpu);
		cpu++;
	}

	/* Finally, destroy the underlying pool. */
	pool_destroy(pool);
}
/*
* pool_cache_cpu_init1:
*
* Called for each pool_cache whenever a new CPU is attached.
*/
static void
pool_cache_cpu_init1(struct cpu_info *ci, pool_cache_t pc)
{
pool_cache_cpu_t *cc;
int index;
index = ci->ci_index;
KASSERT(index < __arraycount(pc->pc_cpus));
if ((cc = pc->pc_cpus[index]) != NULL) {
return;
}
/*
* The first CPU is 'free'. This needs to be the case for
* bootstrap - we may not be able to allocate yet.
*/
if (pc->pc_ncpu == 0) {
cc = &pc->pc_cpu0;
pc->pc_ncpu = 1;
} else {
pc->pc_ncpu++;
cc = pool_get(&cache_cpu_pool, PR_WAITOK);
}
/*
* pool_cache_reclaim:
*
* Reclaim memory from a pool cache.
*/
/*
 * Thin wrapper: runs pool_reclaim() on the pool embedded in the cache
 * (pc->pc_pool) and returns its result converted to bool.
 */
bool
pool_cache_reclaim(pool_cache_t pc)
{
return pool_reclaim(&pc->pc_pool);
}
static inline void
pool_cache_pre_destruct(pool_cache_t pc)
{
	/*
	 * Only caches for which pc_has_pser() holds need a passive
	 * serialization barrier before a batch of one or more objects
	 * is destructed; that case is expected to be rare.
	 */
	if (!__predict_false(pc_has_pser(pc)))
		return;

	pool_barrier();
}
/*
* pool_cache_destruct_object:
*
* Force destruction of an object and its release back into
* the pool.
*/
void
pool_cache_destruct_object(pool_cache_t pc, void *object)
{
/*
* pool_cache_invalidate_groups:
*
* Invalidate a chain of groups and destruct all objects. Return the
* number of groups that were invalidated.
*/
static int
pool_cache_invalidate_groups(pool_cache_t pc, pcg_t *pcg)
{
void *object;
pcg_t *next;
int i, n;
if (pcg == NULL) {
return 0;
}
pool_cache_pre_destruct(pc);
for (n = 0; pcg != NULL; pcg = next, n++) {
next = pcg->pcg_next;
for (i = 0; i < pcg->pcg_avail; i++) {
object = pcg->pcg_objects[i].pcgo_va;
pool_cache_destruct_object1(pc, object);
}
/*
* pool_cache_invalidate:
*
* Invalidate a pool cache (destruct and release all of the
* cached objects). Does not reclaim objects from the pool.
*
* Note: For pool caches that provide constructed objects, there
* is an assumption that another level of synchronization is occurring
* between the input to the constructor and the cache invalidation.
*
* Invalidation is a costly process and should not be called from
* interrupt context.
*/
void
pool_cache_invalidate(pool_cache_t pc)
{
uint64_t where;
pcg_t *pcg;
int n, s;
if (ncpu < 2 || !mp_online) {
/*
 * We might be called early enough in the boot process
 * for the CPU data structures to not be fully initialized.
 * In this case, transfer the content of the local CPU's
 * cache back into global cache as only this CPU is currently
 * running.
 */
pool_cache_transfer(pc);
} else {
/*
 * Signal all CPUs that they must transfer their local
 * cache back to the global pool then wait for the xcall to
 * complete.
 */
where = xc_broadcast(0,
__FPTRCAST(xcfunc_t, pool_cache_transfer), pc, NULL);
xc_wait(where);
}
/*
 * Now dequeue and invalidate everything.  Each list is detached in one
 * step with pool_pcg_trunc() and then destructed group by group.
 */
pcg = pool_pcg_trunc(&pcg_normal_cache);
(void)pool_cache_invalidate_groups(pc, pcg);
/*
 * For the cache's own full and partial lists, subtract the number of
 * groups invalidated from the current CPU's cc_nfull / cc_npart
 * counter, at splvm.
 */
pcg = pool_pcg_trunc(&pc->pc_fullgroups);
n = pool_cache_invalidate_groups(pc, pcg);
s = splvm();
((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_nfull -= n;
splx(s);
pcg = pool_pcg_trunc(&pc->pc_partgroups);
n = pool_cache_invalidate_groups(pc, pcg);
s = splvm();
((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_npart -= n;
splx(s);
}
/*
* pool_cache_invalidate_cpu:
*
* Invalidate all CPU-bound cached objects in pool cache, the CPU being
* identified by its associated index.
* It is caller's responsibility to ensure that no operation is
* taking place on this pool cache while doing this invalidation.
* WARNING: as no inter-CPU locking is enforced, trying to invalidate
* pool cached objects from a CPU different from the one currently running
* may result in an undefined behaviour.
*/
static void
pool_cache_invalidate_cpu(pool_cache_t pc, u_int index)
{
pool_cache_cpu_t *cc;
pcg_t *pcg;
/* Thin wrapper: applies pool_setlowat() to the cache's pool (pc->pc_pool). */
void
pool_cache_setlowat(pool_cache_t pc, int n)
{
pool_setlowat(&pc->pc_pool, n);
}
/* Thin wrapper: applies pool_sethiwat() to the cache's pool (pc->pc_pool). */
void
pool_cache_sethiwat(pool_cache_t pc, int n)
{
pool_sethiwat(&pc->pc_pool, n);
}
/*
 * Thin wrapper: applies pool_sethardlimit() to the cache's pool
 * (pc->pc_pool), passing all arguments through unchanged.
 */
void
pool_cache_sethardlimit(pool_cache_t pc, int n, const char *warnmess, int ratecap)
{
pool_sethardlimit(&pc->pc_pool, n, warnmess, ratecap);
}
/* Thin wrapper: applies pool_prime() to the cache's pool (pc->pc_pool). */
void
pool_cache_prime(pool_cache_t pc, int n)
{
pool_prime(&pc->pc_pool, n);
}
/* Thin wrapper: returns pool_nget() for the cache's pool (pc->pc_pool). */
unsigned int
pool_cache_nget(pool_cache_t pc)
{
return pool_nget(&pc->pc_pool);
}
/* Thin wrapper: returns pool_nput() for the cache's pool (pc->pc_pool). */
unsigned int
pool_cache_nput(pool_cache_t pc)
{
return pool_nput(&pc->pc_pool);
}
/*
* pool_pcg_get:
*
* Get a cache group from the specified list. Return true if
* contention was encountered. Must be called at IPL_VM because
* of spin wait vs. kernel_lock.
*/
static int
pool_pcg_get(pcg_t *volatile *head, pcg_t **pcgp)
{
int count = SPINLOCK_BACKOFF_MIN;
pcg_t *o, *n;
/*
 * Pop the first group off the list at `head'.  While a pop is in
 * flight the head holds the address of pcg_dummy as a "busy" marker.
 * On return *pcgp is the removed group, or NULL if the list was empty.
 */
for (o = atomic_load_relaxed(head);; o = n) {
if (__predict_false(o == &pcg_dummy)) {
/* Wait for concurrent get to complete. */
SPINLOCK_BACKOFF(count);
n = atomic_load_relaxed(head);
continue;
}
if (__predict_false(o == NULL)) {
break;
}
/* Lock out concurrent get/put. */
n = atomic_cas_ptr(head, o, __UNCONST(&pcg_dummy));
if (o == n) {
/* Fetch pointer to next item and then unlock. */
membar_datadep_consumer(); /* alpha */
n = atomic_load_relaxed(&o->pcg_next);
atomic_store_release(head, n);
break;
}
/* CAS lost: `n' holds the head value observed; retry with it. */
}
*pcgp = o;
/*
 * Nonzero iff `count' no longer has its initial value; presumably
 * SPINLOCK_BACKOFF() advances it, so this reports whether we had to
 * back off -- confirm against the macro.
 */
return count != SPINLOCK_BACKOFF_MIN;
}
/*
* pool_pcg_trunc:
*
* Chop out entire list of pool cache groups.
*/
static pcg_t *
pool_pcg_trunc(pcg_t *volatile *head)
{
int count = SPINLOCK_BACKOFF_MIN, s;
pcg_t *o, *n;
/*
 * Runs at splvm: replace the list head with NULL in a single
 * compare-and-swap and return the previous head, i.e. the whole chain
 * (NULL if the list was already empty).
 */
s = splvm();
for (o = atomic_load_relaxed(head);; o = n) {
if (__predict_false(o == &pcg_dummy)) {
/* Wait for concurrent get to complete. */
SPINLOCK_BACKOFF(count);
n = atomic_load_relaxed(head);
continue;
}
n = atomic_cas_ptr(head, o, NULL);
if (o == n) {
/* Swap succeeded: restore the SPL and hand the chain back. */
splx(s);
membar_datadep_consumer(); /* alpha */
return o;
}
}
}
/*
* pool_pcg_put:
*
* Put a pool cache group to the specified list. Return true if
* contention was encountered. Must be called at IPL_VM because of
* spin wait vs. kernel_lock.
*/
static int
pool_pcg_put(pcg_t *volatile *head, pcg_t *pcg)
{
int count = SPINLOCK_BACKOFF_MIN;
pcg_t *o, *n;
/*
 * Push `pcg' onto the front of the list at `head', retrying the
 * compare-and-swap until it succeeds.
 */
for (o = atomic_load_relaxed(head);; o = n) {
if (__predict_false(o == &pcg_dummy)) {
/* Wait for concurrent get to complete. */
SPINLOCK_BACKOFF(count);
n = atomic_load_relaxed(head);
continue;
}
/*
 * Link the group in front of the current head, and make that
 * store visible before the group itself is published.
 */
pcg->pcg_next = o;
membar_release();
n = atomic_cas_ptr(head, o, pcg);
if (o == n) {
/*
 * Nonzero iff `count' changed from its initial value;
 * presumably only the backoff path changes it -- confirm.
 */
return count != SPINLOCK_BACKOFF_MIN;
}
}
}
/*
* If there's a full group, release our empty group back to the
* cache. Install the full group as cc_current and return.
*/
cc->cc_contended += pool_pcg_get(&pc->pc_fullgroups, &pcg);
if (__predict_true(pcg != NULL)) {
KASSERT(pcg->pcg_avail == pcg->pcg_size);
if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) {
KASSERT(cur->pcg_avail == 0);
(void)pool_pcg_put(cc->cc_pcgcache, cur);
}
cc->cc_nfull--;
cc->cc_current = pcg;
return true;
}
/*
* Nothing available locally or in cache. Take the slow
* path: fetch a new object from the pool and construct
* it.
*/
cc->cc_pcmisses++;
splx(s);
/*
* pool_cache_get{,_paddr}:
*
* Get an object from a pool cache (optionally returning
* the physical address of the object).
*/
void *
pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap)
{
pool_cache_cpu_t *cc;
pcg_t *pcg;
void *object;
int s;
/* Exactly one of PR_NOWAIT and PR_WAITOK must be given. */
KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK));
/*
 * An IPL_NONE cache must not be used from hard or soft interrupt
 * context; the check is skipped while cold or after a panic.
 */
if (pc->pc_pool.pr_ipl == IPL_NONE &&
__predict_true(!cold) &&
__predict_true(panicstr == NULL)) {
KASSERTMSG(!cpu_intr_p(),
"%s: [%s] is IPL_NONE, but called from interrupt context",
__func__, pc->pc_pool.pr_wchan);
KASSERTMSG(!cpu_softintr_p(),
"%s: [%s] is IPL_NONE,"
" but called from soft interrupt context",
__func__, pc->pc_pool.pr_wchan);
}
if (flags & PR_WAITOK) {
ASSERT_SLEEPABLE();
}
/* Only non-sleeping requests are subject to fault injection. */
if (flags & PR_NOWAIT) {
if (fault_inject())
return NULL;
}
/* Lock out interrupts and disable preemption. */
s = splvm();
while (/* CONSTCOND */ true) {
/* Try and allocate an object from the current group. */
cc = pc->pc_cpus[curcpu()->ci_index];
pcg = cc->cc_current;
if (__predict_true(pcg->pcg_avail > 0)) {
/* Take the last available object; optionally report its paddr. */
object = pcg->pcg_objects[--pcg->pcg_avail].pcgo_va;
if (__predict_false(pap != NULL))
*pap = pcg->pcg_objects[pcg->pcg_avail].pcgo_pa;
#if defined(DIAGNOSTIC)
pcg->pcg_objects[pcg->pcg_avail].pcgo_va = NULL;
KASSERT(pcg->pcg_avail < pcg->pcg_size);
KASSERT(object != NULL);
#endif
cc->cc_hits++;
splx(s);
FREECHECK_OUT(&pc->pc_freecheck, object);
pool_redzone_fill(&pc->pc_pool, object);
pool_cache_get_kmsan(pc, object);
return object;
}
/*
 * That failed. If the previous group isn't empty, swap
 * it with the current group and allocate from there.
 */
pcg = cc->cc_previous;
if (__predict_true(pcg->pcg_avail > 0)) {
cc->cc_previous = cc->cc_current;
cc->cc_current = pcg;
continue;
}
/*
 * Can't allocate from either group: try the slow path.
 * If get_slow() allocated an object for us, or if
 * no more objects are available, it will return false.
 * Otherwise, we need to retry.
 */
if (!pool_cache_get_slow(pc, cc, s, &object, pap, flags)) {
if (object != NULL) {
kmsan_orig(object, pc->pc_pool.pr_size,
KMSAN_TYPE_POOL, __RET_ADDR);
}
break;
}
}
/*
 * We would like to KASSERT(object || (flags & PR_NOWAIT)), but
 * pool_cache_get can fail even in the PR_WAITOK case, if the
 * constructor fails.
 */
return object;
}
/*
* Try to get an empty group from the cache. If there are no empty
* groups in the cache then allocate one.
*/
(void)pool_pcg_get(cc->cc_pcgcache, &pcg);
if (__predict_false(pcg == NULL)) {
if (__predict_true(!pool_cache_disable)) {
pcg = pool_get(pc->pc_pcgpool, PR_NOWAIT);
}
if (__predict_true(pcg != NULL)) {
pcg->pcg_avail = 0;
pcg->pcg_size = pc->pc_pcgsize;
}
}
/*
* If there's an empty group, release our full group back to the
* cache. Install the empty group to the local CPU and return.
*/
if (pcg != NULL) {
KASSERT(pcg->pcg_avail == 0);
if (__predict_false(cc->cc_previous == &pcg_dummy)) {
cc->cc_previous = pcg;
} else {
cur = cc->cc_current;
if (__predict_true(cur != &pcg_dummy)) {
KASSERT(cur->pcg_avail == cur->pcg_size);
cc->cc_contended +=
pool_pcg_put(&pc->pc_fullgroups, cur);
cc->cc_nfull++;
}
cc->cc_current = pcg;
}
return true;
}
/*
* Nothing available locally or in cache, and we didn't
* allocate an empty group. Take the slow path and destroy
* the object here and now.
*/
cc->cc_pcmisses++;
splx(s);
pool_cache_destruct_object(pc, object);
return false;
}
/*
 * pool_cache_put{,_paddr}:
 *
 *	Put an object back to the pool cache (optionally caching the
 *	physical address of the object).
 *
 *	pc	pool cache the object was obtained from
 *	object	the object being released; must not be NULL
 *	pa	physical address to remember alongside the object
 */
void
pool_cache_put_paddr(pool_cache_t pc, void *object, paddr_t pa)
{
	pool_cache_cpu_t *cc;
	pcg_t *pcg;
	int s;

	/*
	 * Release-side counterparts of the work done when the object was
	 * handed out: the get path calls pool_redzone_fill() and
	 * FREECHECK_OUT(), so the red zone is examined and the object is
	 * checked back in here, before it can be cached and reissued.
	 *
	 * NOTE(review): the get path also calls pool_cache_get_kmsan();
	 * if a matching put-side KMSAN hook exists it belongs here too.
	 */
	KASSERT(object != NULL);
	pool_cache_redzone_check(pc, object);
	FREECHECK_IN(&pc->pc_freecheck, object);

	if (pc->pc_pool.pr_roflags & PR_PHINPAGE) {
		pc_phinpage_check(pc, object);
	}

	/* A true return means the object was disposed of without caching. */
	if (pool_cache_put_nocache(pc, object)) {
		return;
	}

	/* Lock out interrupts and disable preemption. */
	s = splvm();
	while (/* CONSTCOND */ true) {
		/* If the current group isn't full, release it there. */
		cc = pc->pc_cpus[curcpu()->ci_index];
		pcg = cc->cc_current;
		if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) {
			pcg->pcg_objects[pcg->pcg_avail].pcgo_va = object;
			pcg->pcg_objects[pcg->pcg_avail].pcgo_pa = pa;
			pcg->pcg_avail++;
			cc->cc_hits++;
			splx(s);
			return;
		}

		/*
		 * That failed.  If the previous group isn't full, swap
		 * it with the current group and try again.
		 */
		pcg = cc->cc_previous;
		if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) {
			cc->cc_previous = cc->cc_current;
			cc->cc_current = pcg;
			continue;
		}

		/*
		 * Can't free to either group: try the slow path.
		 * If put_slow() releases the object for us, it
		 * will return false.  Otherwise we need to retry.
		 */
		if (!pool_cache_put_slow(pc, cc, s, object))
			break;
	}
}
/*
 * pool_cache_transfer:
 *
 *	Strip the calling CPU's cache of both of its groups and hand them
 *	over to the cache-wide lists.  Run within a cross-call thread.
 *
 *	Each detached group is filed by fill level: completely full
 *	groups go to pc_fullgroups, completely empty ones to the list
 *	referenced by pc_pcgcache, anything in between to pc_partgroups.
 */
static void
pool_cache_transfer(pool_cache_t pc)
{
	pool_cache_cpu_t *cc;
	pcg_t *detached[2];
	pcg_t *pcg;
	int i, s;

	s = splvm();
	cc = pc->pc_cpus[curcpu()->ci_index];

	/* Unhook current and previous, parking the dummy in their place. */
	detached[0] = cc->cc_current;
	cc->cc_current = __UNCONST(&pcg_dummy);
	detached[1] = cc->cc_previous;
	cc->cc_previous = __UNCONST(&pcg_dummy);

	/* File the former current group first, then the former previous. */
	for (i = 0; i < 2; i++) {
		pcg = detached[i];
		if (pcg == &pcg_dummy) {
			/* The slot held only the placeholder. */
			continue;
		}
		if (pcg->pcg_avail == pcg->pcg_size) {
			(void)pool_pcg_put(&pc->pc_fullgroups, pcg);
			cc->cc_nfull++;
		} else if (pcg->pcg_avail == 0) {
			(void)pool_pcg_put(pc->pc_pcgcache, pcg);
		} else {
			(void)pool_pcg_put(&pc->pc_partgroups, pcg);
			cc->cc_npart++;
		}
	}
	splx(s);
}
/*
 * pool_bigidx:
 *
 *	Return the index of the first entry of pool_allocator_big[] whose
 *	size class, 2^(index + POOL_ALLOCATOR_BIG_BASE) bytes, is at least
 *	`size'.  Panics if `size' exceeds every class.
 *
 *	The power of two is formed in size_t so that the shift and the
 *	comparison against `size' happen in a single unsigned type rather
 *	than shifting a signed int and converting it implicitly.
 */
static int
pool_bigidx(size_t size)
{
	size_t i;

	for (i = 0; i < __arraycount(pool_allocator_big); i++) {
		if (((size_t)1 << (i + POOL_ALLOCATOR_BIG_BASE)) >= size)
			return (int)i;
	}
	panic("pool item size %zu too large, use a custom allocator", size);
}
/*
 * NOTE(review): this definition is not self-consistent as it stands.
 * The header declares a routine returning `void *', yet the body never
 * assigns or returns `res', refers to `v' which is not declared in this
 * function, and finishes by releasing `v' through the allocator's
 * pa_free hook.  Confirm against the complete file before relying on it.
 */
static void *
pool_allocator_alloc(struct pool *pp, int flags)
{
struct pool_allocator *pa = pp->pr_alloc;
void *res;
if (pp->pr_redzone) {
/* A pool with a red zone is asserted not to use passive serialization. */
KASSERT(!pp_has_pser(pp));
kasan_mark(v, pa->pa_pagesz, pa->pa_pagesz, 0);
} else if (__predict_false(pp_has_pser(pp))) {
/*
* Perform a passive serialization barrier before freeing
* the pool page back to the system.
*/
pool_barrier();
}
/* Hand the page to the allocator's release hook. */
(*pa->pa_free)(pp, v);
}
/*
 * NOTE(review): this definition is not self-consistent as it stands.
 * It is declared to return `void *' but only contains bare `return;'
 * statements, the results of the uvm_km_kmem_alloc() call (`ret', `va')
 * are never examined, and `requested_size', `redzsz' and `nsz' are not
 * declared in this function.  Everything after the allocation call deals
 * with sizing a red zone for the pool.  Confirm against the complete
 * file before relying on it.
 */
void *
pool_page_alloc(struct pool *pp, int flags)
{
/* Sleep in the allocator only if the caller passed PR_WAITOK. */
const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP;
vmem_addr_t va;
int ret;
ret = uvm_km_kmem_alloc(kmem_va_arena, pp->pr_alloc->pa_pagesz,
vflags | VM_INSTANTFIT, &va);
/* PR_NOTOUCH pools never get a red zone. */
if (pp->pr_roflags & PR_NOTOUCH) {
pp->pr_redzone = false;
return;
}
/*
* We may have extended the requested size earlier; check if
* there's naturally space in the padding for a red zone.
*/
if (pp->pr_size - requested_size >= redzsz) {
pp->pr_reqsize_with_redzone = requested_size + redzsz;
pp->pr_redzone = true;
return;
}
/*
* No space in the natural padding; check if we can extend a
* bit the size of the pool.
*
* Avoid using redzone for allocations half of a page or larger.
* For pagesize items, we'd waste a whole new page (could be
* unmapped?), and for half pagesize items, approximately half
* the space is lost (eg, 4K pages, you get one 2K allocation.)
*/
/* Candidate item size: current size plus red zone, rounded up to pr_align. */
nsz = roundup(pp->pr_size + redzsz, pp->pr_align);
if (nsz <= (pp->pr_alloc->pa_pagesz / 2)) {
/* Ok, we can */
pp->pr_size = nsz;
pp->pr_reqsize_with_redzone = requested_size + redzsz;
pp->pr_redzone = true;
} else {
/* No space for a red zone... snif :'( */
pp->pr_redzone = false;
aprint_debug("pool redzone disabled for '%s'\n", pp->pr_wchan);
}
}
/*
* We really don't want the first byte of the red zone to be '\0';
* an off-by-one in a string may not be properly detected.
*/
pat = pool_pattern_generate(cp);
*cp = (pat == '\0') ? STATIC_BYTE: pat;
cp++;
/*
 * pool_cache_redzone_check:
 *
 *	Run pool_redzone_check() on object `p' of pool cache `pc'.  In
 *	KASAN kernels the check is skipped for caches that have a
 *	constructor or destructor, or that use passive serialization.
 */
static void
pool_cache_redzone_check(pool_cache_t pc, void *p)
{
#ifdef KASAN
	/*
	 * With a ctor/dtor, or with passive serialization in use, the
	 * object's data is left as valid, so there is nothing to check.
	 */
	const bool keep_valid =
	    pc_has_ctor(pc) || pc_has_dtor(pc) || pc_has_pser(pc);

	if (__predict_false(keep_valid)) {
		return;
	}
#endif
	pool_redzone_check(&pc->pc_pool, p);
}
static int
pool_sysctl(SYSCTLFN_ARGS)
{
struct pool_sysctl data;
struct pool *pp;
struct pool_cache *pc;
pool_cache_cpu_t *cc;
int error;
size_t i, written;