/* $NetBSD$ */
/*-
* Copyright (c) 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Cherry G. Mathew <
[email protected]>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* TODO:
xen.balloon
xen.balloon.current: DONE
xen.balloon.target: IN PROGRESS
xen.balloon.low-balloon
xen.balloon.high-balloon
xen.balloon.limit
sysctl labels = { 'current' : 'Current allocation',
'target' : 'Requested target',
'low-balloon' : 'Low-mem balloon',
'high-balloon' : 'High-mem balloon',
'limit' : 'Xen hard limit' }
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD$");
#include "opt_balloon.h"
#include <sys/param.h>
#include <sys/condvar.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/balloon.h>
#include <uvm/uvm.h>
#include <uvm/uvm_extern.h>
#include <xen/xenpmap.h>
/*
 * Poll interval of the balloon thread: it is converted to ticks with
 * mstohz() and used as the timeout when waiting for a target change.
 */
#define BALLOONINTERVALMS 100 /* milliseconds */
/*
 * XXX: fix limits
 * Intended bounds for the balloon target.  In the code visible in this
 * file they are referenced only by a commented-out sanity check in the
 * sysctl handler, i.e. they are not enforced.
 */
#define LOW_BALLOON 100 /* In pages */
#define HIGH_BALLOON SIZE_T_MAX /* In pages */
/*
 * Forward declaration: the xenbus watch structure initialised further
 * down stores a pointer to this callback before the function is defined.
 */
static void xenbus_balloon_watcher(struct xenbus_watch *watch, const char **vec,
unsigned int len);
/*
 * Book-keeping record for one page that has been taken away from UVM
 * and put into the balloon.  Records are kept on the singly-linked list
 * balloon_conf.balloon_page_entries.
 */
struct balloon_page_entry {
struct vm_page *pg; /* UVM handle of the ballooned page (pointer only, no copy) */
SLIST_ENTRY(balloon_page_entry) entry; /* Linkage on the balloon page list */
};
/*
 * Global state of the balloon driver (single instance).
 *
 * The locks and the condvar are initialised in balloon_xenbus_setup().
 * .target is read and written through balloon_get_target() and
 * balloon_set_target(), which take tgtlock.
 */
static struct balloon_conf {
kmutex_t flaglock; /* Protects condvar (below) */
kcondvar_t cv_memchanged; /* Notifier flag for target (below) */
kmutex_t tgtlock; /* Spin lock, protects .target, below */
size_t target; /* Target balloon size, in pages. */
/* Pages currently held in the balloon, most recently added first. */
SLIST_HEAD(, balloon_page_entry) balloon_page_entries;
} balloon_conf;
/*
 * Watch on the xenbus node "memory/target".  It is registered in
 * balloon_xenbus_setup() and names xenbus_balloon_watcher() as callback.
 */
static struct xenbus_watch xenbus_balloon_watch = {
.node = __UNCONST("memory/target"),
.xbw_callback = xenbus_balloon_watcher,
};
/* Returns zero, on error */
static size_t
xenmem_get_maxreservation(void)
{
int ret;
ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation,
& (domid_t) { DOMID_SELF });
if (ret < 0) {
/* XXX: panic() ? */
return 0;
}
return ret;
}
/* Returns zero, on error */
static size_t
xenmem_get_currentreservation(void)
{
int ret;
ret = HYPERVISOR_memory_op(XENMEM_current_reservation,
& (domid_t) { DOMID_SELF });
if (ret < 0) {
/* XXX: panic() ? */
return 0;
}
return ret;
}
/*
 * Return a snapshot of the requested balloon target (in pages).
 * The value is read while holding the tgtlock spin mutex.
 */
static size_t
balloon_get_target(void)
{
	size_t snapshot;

	mutex_spin_enter(&balloon_conf.tgtlock);
	snapshot = balloon_conf.target;
	mutex_spin_exit(&balloon_conf.tgtlock);

	return snapshot;
}
/*
 * Record a new balloon target (in pages).
 * The value is stored while holding the tgtlock spin mutex; nobody is
 * notified here, that is left to the caller.
 */
static void
balloon_set_target(size_t target)
{

	mutex_spin_enter(&balloon_conf.tgtlock);
	balloon_conf.target = target;
	mutex_spin_exit(&balloon_conf.tgtlock);
}
/*
 * Take up to npages pages away from UVM so that they can be handed
 * over to the hypervisor.
 *
 * For every page obtained, its machine frame number is stored in the
 * next slot of mfn_list, its physical-to-machine table entry is
 * invalidated, and a tracking entry pointing at the vm_page is pushed
 * onto balloon_conf.balloon_page_entries.
 *
 * mfn_list must have room for at least npages entries.
 *
 * Returns the number of pages actually reserved; this is less than
 * npages when a page or a tracking entry could not be allocated.
 */
static size_t
reserve_pages(size_t npages, xen_pfn_t *mfn_list)
{
	struct balloon_page_entry *bpg_entry;
	struct vm_page *pg;
	size_t newpgcount;
	paddr_t pfn;

	for (newpgcount = 0; newpgcount < npages; newpgcount++) {
		pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
		if (pg == NULL) {
			break;
		}

		/*
		 * Get the tracking entry before the P->M table is
		 * touched.  If this fails the page goes back to UVM, and
		 * it must do so with its P->M entry still intact and
		 * without leaving a stale frame number in mfn_list.
		 */
		bpg_entry = kmem_alloc(sizeof *bpg_entry, KM_SLEEP);
		if (bpg_entry == NULL) {
			uvm_pagefree(pg);
			break;
		}

		pfn = x86_btop(VM_PAGE_TO_PHYS(pg) - XPMAP_OFFSET);
		mfn_list[newpgcount] = pfn_to_mfn(pfn);

		/* Invalidate pg */
		xpmap_phys_to_machine_mapping[pfn] = INVALID_P2M_ENTRY;

		/*
		 * Save the page.
		 * XXX: We don't keep a copy, but just save a pointer
		 * to the uvm pg handle. Is this ok ?
		 */
		bpg_entry->pg = pg;
		SLIST_INSERT_HEAD(&balloon_conf.balloon_page_entries,
		    bpg_entry, entry);
	}

	return newpgcount;
}
/*
 * Move pages out of the balloon and back to UVM.
 *
 * ret is the number of machine frames available in mfn_list.  For each
 * of them, the most recently ballooned page is taken off
 * balloon_conf.balloon_page_entries, its tracking entry is freed, its
 * physical-to-machine entry is pointed at the frame, the hypervisor is
 * told about the new machine-to-physical association, and the page is
 * freed to UVM.
 *
 * Stops early when the balloon list runs empty.  Panics if the MMU
 * update hypercall fails.
 *
 * Returns the number of pages handed back.
 */
static size_t
unreserve_pages(size_t ret, xen_pfn_t *mfn_list)
{
	size_t done = 0;

	while (done < ret) {
		struct balloon_page_entry *head;
		struct vm_page *page;
		mmu_update_t req;
		paddr_t pfn;
		int nupdated;

		head = SLIST_FIRST(&balloon_conf.balloon_page_entries);
		if (head == NULL) {
			/*
			 * XXX: This is the case where extra mem w.r.t
			 * boot comes in ?
			 */
			printf("Balloon is empty. can't be collapsed further!");
			/* XXX: mark down target ? */
			break;
		}

		/* Detach the entry; only the page handle is needed. */
		SLIST_REMOVE_HEAD(&balloon_conf.balloon_page_entries, entry);
		page = head->pg;
		kmem_free(head, sizeof *head);

		/* Update P->M */
		pfn = x86_btop(VM_PAGE_TO_PHYS(page) - XPMAP_OFFSET);
		xpmap_phys_to_machine_mapping[pfn] = mfn_list[done];

		/* Update the MMU */
		req.ptr = x86_ptob(mfn_list[done]) | MMU_MACHPHYS_UPDATE;
		req.val = pfn;
		if (HYPERVISOR_mmu_update(&req, 1, &nupdated, DOMID_SELF) < 0) {
			panic("MMU Update failed!");
		}

		/* Free it to UVM */
		uvm_pagefree(page);

		done++;
	}

	return done;
}
/*
 * Grow the balloon: take up to npages pages away from UVM and return
 * the underlying machine frames to the hypervisor with
 * XENMEM_decrease_reservation.
 *
 * npages is clamped to INT_MAX because the hypercall reports its result
 * as an int.  If the hypercall fails, the reserved pages are handed
 * back to UVM again.
 *
 * The temporary frame list is released on every path.
 */
static void
balloon_inflate(size_t npages)
{
	int ret;
	size_t respgcnt;
	size_t listsz;
	xen_pfn_t *mfn_list;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid = DOMID_SELF
	};

	/*
	 * There's a risk that npages might overflow ret.
	 * Do this in smaller steps then.
	 * See: HYPERVISOR_memory_op(...) below....
	 */
	if (npages > INT_MAX) {
		npages = INT_MAX;
	}

	/* Remember the size: kmem_free() must be given the same value. */
	listsz = npages * sizeof *mfn_list;
	mfn_list = kmem_alloc(listsz, KM_SLEEP);
	if (mfn_list == NULL) {
		printf("%s: Error, could not allocate kernel memory",
		    __FILE__);
		return;
	}

	/* May reserve fewer than npages pages. */
	respgcnt = reserve_pages(npages, mfn_list);

	/* Hand over pages to Hypervisor */
	xenguest_handle(reservation.extent_start) = mfn_list;
	reservation.nr_extents = respgcnt;
	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	if (ret < 0) {
		/* Unroll loop and release page frames back to the OS. */
		unreserve_pages(respgcnt, mfn_list);
		kmem_free(mfn_list, listsz);
		return;
	}

	/*
	 * Only respgcnt extents were submitted, so that is the count
	 * the hypervisor is expected to confirm, not npages.
	 */
	KASSERT((size_t)ret == respgcnt);

	kmem_free(mfn_list, listsz);
	printf("inflated by %d\n", ret);
}
/*
 * Shrink the balloon: ask the hypervisor for up to npages additional
 * machine frames with XENMEM_increase_reservation and attach them to
 * pages held in the balloon, which are then freed to UVM.
 *
 * npages is clamped to INT_MAX because the hypercall reports its result
 * as an int, and to the difference between the maximum and the current
 * reservation when that difference is known and non-zero.
 *
 * Panics if the hypervisor does not grant any frame.  The temporary
 * frame list is released before returning.
 */
static void
balloon_deflate(size_t npages)
{
	int ret;
	size_t pgmax;
	size_t listsz;
	xen_pfn_t *mfn_list;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid = DOMID_SELF
	};

	/*
	 * There's a risk that npages might overflow ret.
	 * Do this in smaller steps then.
	 * See: HYPERVISOR_memory_op(...) below....
	 */
	if (npages > INT_MAX) {
		npages = INT_MAX;
	}

	/*
	 * XXX: This is wrong. npages is the _delta_.
	 * Trim npages, if its exceeded the hard limit
	 */
	if ((pgmax = xenmem_get_maxreservation()) > 0) {
		pgmax -= xenmem_get_currentreservation();
	}
	if (npages > pgmax && pgmax > 0) {
		npages = pgmax;
	}

	/*
	 * Remember the size: npages is reused for the result further
	 * down, and kmem_free() must be given the allocation size.
	 */
	listsz = npages * sizeof *mfn_list;
	mfn_list = kmem_alloc(listsz, KM_SLEEP);
	if (mfn_list == NULL) {
		printf("%s: Error, could not allocate kernel memory",
		    __FILE__);
		return;
	}

	xenguest_handle(reservation.extent_start) = mfn_list;
	reservation.nr_extents = npages;
	ret = HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation);
	if (ret <= 0) {
		panic("Increase reservation failed");
		/* NOTREACHED */
		return;
	}

	/* Attach the granted frames to ballooned pages. */
	npages = unreserve_pages(ret, mfn_list);

	/* The frame list is no longer needed; do not leak it. */
	kmem_free(mfn_list, listsz);

	printf("deflated by %zu\n", npages);
}
/*
 * Perform one resize step towards targetpages.
 *
 * Compares the current reservation with targetpages and either deflates
 * the balloon (target above current) or inflates it (target below
 * current) by the difference.  Nothing is done when they are equal.
 *
 * Returns the current reservation as measured after the step, which is
 * not necessarily equal to targetpages.
 */
static size_t
balloon_resize(size_t targetpages)
{
	size_t have;

	/* Where are we now? */
	have = xenmem_get_currentreservation();
	KASSERT(have > 0);

	if (have == targetpages) {
		return have;
	}

#if 0
	printf("Current pages == %zu\n", have);
#endif

	/* Move towards the target by the full difference. */
	if (have < targetpages) {
		balloon_deflate(targetpages - have);
	} else {
		balloon_inflate(have - targetpages);
	}

	/* Measure again: the step may have been partial. */
	have = xenmem_get_currentreservation();
	KASSERT(have > 0);

	yield();

	return have;
}
/*
 * Body of the balloon kernel thread; never returns.
 *
 * Sleeps on cv_memchanged (re-checking every BALLOONINTERVALMS
 * milliseconds) until the requested target differs from the value
 * reached so far, then performs one resize step.  The step's result
 * becomes the new "reached" value, so the loop keeps stepping for as
 * long as the reservation has not arrived at the target.
 *
 * flaglock is held while waiting and during the resize step.
 */
static void
balloon_thread(void *ignore)
{
	size_t reached;
	const int timo = mstohz(BALLOONINTERVALMS);

	/*
	 * Start out with whatever the target currently is.  The value
	 * is probably rubbish, but it guarantees that the wait loop
	 * below does not fall through before the target has been set
	 * properly for the first time.
	 */
	reached = balloon_get_target();

	for (;;) {
		size_t wanted;

		mutex_enter(&balloon_conf.flaglock);

		while (balloon_get_target() == reached) {
			cv_timedwait(&balloon_conf.cv_memchanged,
			    &balloon_conf.flaglock, timo);
		}

		wanted = balloon_get_target();
#if 0
		printf("new target ==> %zu\n", wanted);
#endif
		reached = balloon_resize(wanted);

		mutex_exit(&balloon_conf.flaglock);
	}
}
/*
 * Read the xenbus node "memory/target" and convert it to pages.
 *
 * The value read is multiplied by 1024 and divided by PAGE_SIZE, i.e.
 * it is taken to be expressed in units of 1024 bytes.
 *
 * Returns the target in pages, or zero if the node could not be read.
 */
static size_t
xenbus_balloon_read_target(void)
{
	unsigned long long kib;
	int err;

	err = xenbus_read_ull(NULL, "memory", "target", &kib, 0);
	if (err != 0) {
		printf("error, couldn't read\n");
		return 0;
	}

	/* Convert to npages */
	return kib * 1024 / PAGE_SIZE;
}
/*
 * xenbus watch callback for "memory/target".
 *
 * Re-reads the target, stores it as the new balloon target and tries to
 * wake up the balloon thread.  A target of zero (which is also what a
 * failed read yields) is ignored.  None of the arguments are used.
 */
static void
xenbus_balloon_watcher(struct xenbus_watch *watch, const char **vec,
    unsigned int len)
{
	unsigned long long npages;

	npages = xenbus_balloon_read_target();
	if (npages == 0) {
		/* Don't update target value */
		return;
	}

	balloon_set_target(npages);
	printf("Setting target to %llu\n", npages);
	printf("Current reservation is %zu\n", xenmem_get_currentreservation());

	/*
	 * Notify balloon thread, if we can.  When the lock is busy the
	 * signal is skipped; the thread re-checks the target on its
	 * periodic timeout anyway.
	 */
	if (!mutex_tryenter(&balloon_conf.flaglock)) {
		return;
	}
	cv_signal(&balloon_conf.cv_memchanged);
	mutex_exit(&balloon_conf.flaglock);
}
/*
 * Set up the balloon driver: initialise locks, condvar and the balloon
 * page list, register the xenbus watch on "memory/target" and start
 * the balloon kernel thread.
 *
 * Does nothing unless the kernel is built with XEN_BALLOON.
 *
 * On failure of either the watch registration or the thread creation,
 * everything set up so far is torn down and the function returns
 * without starting (or keeping) anything.
 */
void
balloon_xenbus_setup(void)
{
#ifdef XEN_BALLOON
	/* Setup flaglocks, condvars et. al */
	mutex_init(&balloon_conf.flaglock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&balloon_conf.tgtlock, MUTEX_DEFAULT, IPL_HIGH);
	cv_init(&balloon_conf.cv_memchanged, "ballooning");
	SLIST_INIT(&balloon_conf.balloon_page_entries);

	/* Setup xenbus node watch callback */
	if (register_xenbus_watch(&xenbus_balloon_watch)) {
		aprint_error("%s: unable to watch memory/target\n", __func__);
		cv_destroy(&balloon_conf.cv_memchanged);
		mutex_destroy(&balloon_conf.tgtlock);
		mutex_destroy(&balloon_conf.flaglock);
		/*
		 * Stop here: the locks are gone, so the thread must
		 * not be started (and they must not be destroyed a
		 * second time below).
		 */
		return;
	}

	/* Setup kernel thread to asynchronously (in/de)-flate the balloon */
	if (kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, balloon_thread,
	    NULL /* arg */, NULL, "balloon")) {
		aprint_error("%s: unable to create balloon thread\n", __func__);
		unregister_xenbus_watch(&xenbus_balloon_watch);
		cv_destroy(&balloon_conf.cv_memchanged);
		mutex_destroy(&balloon_conf.tgtlock);
		mutex_destroy(&balloon_conf.flaglock);
		return;
	}
#endif
	return;
}
/*
* sysctl(9) stuff
*/
/*
 * sysctl helper routine for the "current" and "target" balloon nodes.
 *
 * "current": reports the current reservation as obtained from the
 *            hypervisor.
 * "target":  reports the requested target; when a different value is
 *            written it becomes the new target.  Negative values are
 *            rejected with EINVAL, since the target is kept as an
 *            unsigned page count.
 *
 * Returns 0 on success, the error from sysctl_lookup(), or EINVAL for
 * an unknown node name or an invalid new target.
 */
static int
sysctl_kern_xen_balloon(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	/*
	 * Assumes SIZE_T_MAX <= ((uint64_t) -1) see createv() in
	 * SYSCTL_SETUP(...) below
	 */
	int error;
	int64_t node_val;
	int64_t newnode_val;

	KASSERT(rnode != NULL);
	node = *rnode;

	if (strcmp(node.sysctl_name, "current") == 0) {
		node_val = xenmem_get_currentreservation();
		KASSERT(node_val < SIZE_T_MAX);
		node.sysctl_data = &node_val;
		return sysctl_lookup(SYSCTLFN_CALL(&node));
	} else if (strcmp(node.sysctl_name, "target") == 0) {
		/* Let sysctl_lookup() work on a copy of the old value. */
		newnode_val = node_val = balloon_get_target();
		node.sysctl_data = &newnode_val;
		error = sysctl_lookup(SYSCTLFN_CALL(&node));
		if (error != 0) {
			return error;
		}

		/*
		 * Sanity check new size.  A negative value would turn
		 * into an enormous page count once stored in a size_t.
		 * XXX: the LOW_BALLOON / HIGH_BALLOON limits are not
		 * enforced yet.
		 */
		if (newnode_val < 0) {
			return EINVAL;
		}

		KASSERT(node_val < SIZE_T_MAX);

		if (node_val != newnode_val) {
			printf("setting to %qd", newnode_val);
			balloon_set_target(newnode_val);
		}
		return 0;
	}

	return EINVAL;
}
/*
 * Setup nodes: create kern.xen.balloon with the leaves "current"
 * (read-only) and "target" (read-write), both served by
 * sysctl_kern_xen_balloon().
 *
 * Each level is created below the node returned by the previous call,
 * so setup stops at the first failure instead of attaching children to
 * a parent that was never created.
 */
SYSCTL_SETUP(sysctl_kern_xen_balloon_setup, "sysctl kern.xen.balloon setup")
{
	const struct sysctlnode *node = NULL;
	int error;

	error = sysctl_createv(clog, 0, NULL, &node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "kern", NULL,
	    NULL, 0, NULL, 0,
	    CTL_KERN, CTL_EOL);
	if (error != 0 || node == NULL) {
		goto fail;
	}

	error = sysctl_createv(clog, 0, &node, &node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "xen",
	    SYSCTL_DESCR("Xen"),
	    NULL, 0, NULL, 0,
	    CTL_CREATE, CTL_EOL);
	if (error != 0 || node == NULL) {
		goto fail;
	}

	error = sysctl_createv(clog, 0, &node, &node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "balloon",
	    SYSCTL_DESCR("Balloon"),
	    NULL, 0, NULL, 0,
	    CTL_CREATE, CTL_EOL);
	if (error != 0 || node == NULL) {
		goto fail;
	}

	error = sysctl_createv(clog, 0, &node, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_QUAD, "current",
	    SYSCTL_DESCR("Current balloon size"),
	    sysctl_kern_xen_balloon, 0, NULL, 0,
	    CTL_CREATE, CTL_EOL);
	if (error != 0) {
		goto fail;
	}

	error = sysctl_createv(clog, 0, &node, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_QUAD, "target",
	    SYSCTL_DESCR("Target balloon size"),
	    sysctl_kern_xen_balloon, 0, NULL, 0,
	    CTL_CREATE, CTL_EOL);
	if (error != 0) {
		goto fail;
	}

	return;

fail:
	printf("%s: sysctl_createv failed (error %d)\n", __func__, error);
}