/* $NetBSD$ */

/*-
* Copyright (c) 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Cherry G. Mathew <[email protected]>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in the
*    documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

/*
* TODO:
xen.balloon
xen.balloon.current: DONE
xen.balloon.target: IN PROGRESS
xen.balloon.low-balloon
xen.balloon.high-balloon
xen.balloon.limit

sysctl labels = { 'current'      : 'Current allocation',
          'target'       : 'Requested target',
          'low-balloon'  : 'Low-mem balloon',
          'high-balloon' : 'High-mem balloon',
          'limit'        : 'Xen hard limit' }

*/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD$");

#include "opt_balloon.h"

#include <sys/param.h>

#include <sys/condvar.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/sysctl.h>

#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/balloon.h>

#include <uvm/uvm.h>
#include <uvm/uvm_extern.h>
#include <xen/xenpmap.h>

/* Period of the balloon thread's timed wait on cv_memchanged. */
#define BALLOONINTERVALMS 100 /* milliseconds */
/* XXX: fix limits */
/*
 * Presumably the lower/upper bounds for an acceptable balloon target;
 * no active code in this file references them.
 */
#define LOW_BALLOON 100 /* In pages */
#define HIGH_BALLOON SIZE_T_MAX /* In pages */

/*
 * Forward declaration: the callback is named in the initializer of
 * xenbus_balloon_watch, which precedes the function's definition.
 */
static void xenbus_balloon_watcher(struct xenbus_watch *watch, const char **vec,
                                  unsigned int len);

/*
 * List node recording one page that reserve_pages() took from UVM, so
 * that unreserve_pages() can later find the page and free it again.
 */
struct balloon_page_entry {
       struct vm_page *pg; /* UVM page handle (pointer only, not a copy) */
       SLIST_ENTRY(balloon_page_entry) entry; /* Linkage on balloon_conf.balloon_page_entries */
};

/* Driver state; there is a single file-scope instance. */
static struct balloon_conf {
       kmutex_t flaglock; /* Protects condvar (below) */
       kcondvar_t cv_memchanged; /* Notifier flag for target (below) */

       kmutex_t tgtlock; /* Spin lock, protects .target, below */
       size_t target; /* Target balloon size, in pages. */

       /*
        * Pages currently tracked by the balloon: pushed at the head by
        * reserve_pages(), popped from the head by unreserve_pages().
        */
       SLIST_HEAD(, balloon_page_entry) balloon_page_entries;

} balloon_conf;

/*
 * Watch on the "memory/target" xenbus node, with
 * xenbus_balloon_watcher() as its callback.  Registered from
 * balloon_xenbus_setup().
 */
static struct xenbus_watch xenbus_balloon_watch = {
       .node = __UNCONST("memory/target"),
       .xbw_callback = xenbus_balloon_watcher,
};


/* Returns zero, on error */
static size_t
xenmem_get_maxreservation(void)
{
       int ret;
       ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation,
                                  & (domid_t) { DOMID_SELF });

       if (ret < 0) {
               /* XXX: panic() ? */
               return 0;
       }

       return ret;

}

/* Returns zero, on error */
static size_t
xenmem_get_currentreservation(void)
{
       int ret;

       ret = HYPERVISOR_memory_op(XENMEM_current_reservation,
                                  & (domid_t) { DOMID_SELF });

       if (ret < 0) {
               /* XXX: panic() ? */
               return 0;
       }

       return ret;

}

static size_t
balloon_get_target(void)
{
       size_t target;

       mutex_spin_enter(&balloon_conf.tgtlock);
       target = balloon_conf.target;
       mutex_spin_exit(&balloon_conf.tgtlock);

       return target;

}

static void
balloon_set_target(size_t target)
{

       mutex_spin_enter(&balloon_conf.tgtlock);
       balloon_conf.target = target;
       mutex_spin_exit(&balloon_conf.tgtlock);

       return;

}

/*
 * Take up to npages zeroed pages from UVM and prepare them for being
 * handed to the hypervisor.
 *
 * For every page obtained, its machine frame number is stored in
 * mfn_list[], its P->M entry is invalidated, and a tracking entry is
 * pushed onto balloon_conf.balloon_page_entries.
 *
 * mfn_list must have room for npages entries.
 *
 * Returns the number of pages actually reserved, which is less than
 * npages if UVM (or the tracking-entry allocation) ran dry.
 */
static size_t
reserve_pages(size_t npages, xen_pfn_t *mfn_list)
{
       struct balloon_page_entry *bpg_entry;
       size_t newpgcount;
       paddr_t pfn;

       for (newpgcount = 0; newpgcount < npages; newpgcount++) {
               struct vm_page *pg;

               pg = uvm_pagealloc(NULL, 0, NULL,
                                  UVM_PGA_ZERO);

               if (pg == NULL) {
                       break;
               }

               /*
                * Get the tracking entry before touching the P->M
                * table.  If this fails the page is returned to UVM
                * with its mapping still intact, instead of being
                * freed with an INVALID_P2M_ENTRY mapping.
                *
                * XXX: We don't keep a copy, but just save a pointer
                * to the uvm pg handle. Is this ok ?
                */
               bpg_entry = kmem_alloc(sizeof *bpg_entry, KM_SLEEP);

               if (bpg_entry == NULL) {
                       uvm_pagefree(pg);
                       break;
               }

               /* Record the machine frame backing this page. */
               pfn = x86_btop(VM_PAGE_TO_PHYS(pg) - XPMAP_OFFSET);
               mfn_list[newpgcount] = pfn_to_mfn(pfn);

               /* Invalidate pg */
               xpmap_phys_to_machine_mapping[pfn] = INVALID_P2M_ENTRY;

               bpg_entry->pg = pg;

               SLIST_INSERT_HEAD(&balloon_conf.balloon_page_entries,
                                 bpg_entry, entry);
       }

       return newpgcount;
}

/*
 * Give up to `ret' ballooned pages back to UVM, backing them with the
 * machine frames listed in mfn_list[].
 *
 * Each iteration pops the head of balloon_conf.balloon_page_entries,
 * points the page's P->M entry at the next frame from mfn_list[],
 * issues the matching MMU_MACHPHYS_UPDATE to the hypervisor and then
 * frees the page to UVM.  A failed MMU update panics.
 *
 * Returns the number of pages processed; this is short of `ret' if the
 * balloon's page list ran empty first.
 */
static size_t
unreserve_pages(size_t ret, xen_pfn_t *mfn_list)
{
       size_t done = 0;

       while (done < ret) {
               struct balloon_page_entry *head;
               struct vm_page *page;
               mmu_update_t req;
               paddr_t pfn;
               int nupdated;

               head = SLIST_FIRST(&balloon_conf.balloon_page_entries);
               if (head == NULL) {
                       /*XXX: This is the case where extra mem w.r.t boot comes in ? */
                       printf("Balloon is empty. can't be collapsed further!");
                       /*XXX: mark down target ? */
                       break;
               }

               /* Detach the entry; only the page handle is needed from here on. */
               SLIST_REMOVE_HEAD(&balloon_conf.balloon_page_entries, entry);
               page = head->pg;
               kmem_free(head, sizeof *head);

               /* Update P->M */
               pfn = x86_btop(VM_PAGE_TO_PHYS(page) - XPMAP_OFFSET);
               xpmap_phys_to_machine_mapping[pfn] = mfn_list[done];

               /* Update the MMU */
               req.ptr = x86_ptob(mfn_list[done]) | MMU_MACHPHYS_UPDATE;
               req.val = pfn;
               if (HYPERVISOR_mmu_update(&req, 1, &nupdated, DOMID_SELF) < 0) {
                       panic("MMU Update failed!");
               }

               /* Free it to UVM */
               uvm_pagefree(page);

               done++;
       }

       return done;
}

static void
balloon_inflate(size_t npages)
{


       int ret;
       size_t respgcnt;

       xen_pfn_t *mfn_list;

       struct xen_memory_reservation reservation = {
               .address_bits = 0,
               .extent_order = 0,
               .domid        = DOMID_SELF
       };

       /*
        * There's a risk that npages might overflow ret.
        * Do this is smaller steps then.
        * See: HYPERVISOR_memory_op(...) below....
        */

       if (npages > INT_MAX) {
               npages = INT_MAX;
       }

       mfn_list = kmem_alloc(npages * sizeof *mfn_list,
                             KM_SLEEP);

       if (mfn_list == NULL) {
               printf("%s: Error, could not allocate kernel memory",
                      __FILE__);
               return;
       }

       respgcnt = reserve_pages(npages, mfn_list);

       /* Hand over pages to Hypervisor */
       xenguest_handle(reservation.extent_start) = mfn_list;
       reservation.nr_extents = respgcnt;

       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);

       if (ret < 0) {
               /* Unroll loop and release page frames back to the OS. */
               unreserve_pages(respgcnt, mfn_list);
               return;
       }

       KASSERT(ret == npages);

       kmem_free(mfn_list, npages * sizeof *mfn_list);
       printf("inflated by %d\n", ret);
       return;

}

static void
balloon_deflate(size_t npages)
{

       int ret;
       size_t pgmax;
       xen_pfn_t *mfn_list;

       struct xen_memory_reservation reservation = {
               .address_bits = 0,
               .extent_order = 0,
               .domid        = DOMID_SELF
       };

       /*
        * There's a risk that npages might overflow ret.
        * Do this is smaller steps then.
        * See: HYPERVISOR_memory_op(...) below....
        */

       if (npages > INT_MAX) {
               npages = INT_MAX;
       }

       /* XXX: This is wrong. npages is the _delta_.
        * Trim npages, if its exceeded the hard limit
        */
       if ((pgmax = xenmem_get_maxreservation()) > 0) {
               pgmax -= xenmem_get_currentreservation();
       }

       if (npages > pgmax && pgmax > 0) {
               npages = pgmax;
       }

       mfn_list = kmem_alloc(npages * sizeof *mfn_list, KM_SLEEP);

       if (mfn_list == NULL) {
               printf("%s: Error, could not allocate kernel memory",
                      __FILE__);
               return;
       }

       xenguest_handle(reservation.extent_start) = mfn_list;
       reservation.nr_extents = npages;

       ret = HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation);

       if (ret <= 0) {
               panic("Increase reservation failed");
               /* NOTREACHED */
               return;
       }

       npages = unreserve_pages(ret, mfn_list);

       printf("deflated by %zu\n", npages);

       return;

}

/*
 * Move the domain's reservation towards targetpages.
 *
 * Compares the target with the current reservation and calls
 * balloon_deflate() (target above current) or balloon_inflate()
 * (target below current) with the difference.  Yields the CPU once
 * after an adjustment.
 *
 * Returns the current reservation as re-read after the adjustment, or
 * the unchanged current reservation when it already equals the target.
 */
static size_t
balloon_resize(size_t targetpages)
{
       size_t have;

       have = xenmem_get_currentreservation();
       KASSERT(have > 0);

       /* Already there. */
       if (have == targetpages) {
               return have;
       }

       if (have < targetpages) {
               balloon_deflate(targetpages - have);
       } else {
               balloon_inflate(have - targetpages);
       }

       /* Re-read: the adjustment may have fallen short of the request. */
       have = xenmem_get_currentreservation();
       KASSERT(have > 0);

       yield();

       return have;
}

/*
 * Body of the balloon kernel thread; never returns.
 *
 * With flaglock held, waits on cv_memchanged (timing out every
 * BALLOONINTERVALMS) until the target differs from the value last
 * worked on, then calls balloon_resize() for the new target and
 * remembers the reservation it reports.  The argument is unused.
 */
static void
balloon_thread(void *ignore)
{
       const int waitticks = mstohz(BALLOONINTERVALMS);
       size_t lastseen;

       /*
        * Seed with whatever the target is right now (probably
        * rubbish), so the wait below only ends once the target has
        * been set to something else for the first time.
        */
       lastseen = balloon_get_target();

       for (;;) {
               size_t wanted;

               mutex_enter(&balloon_conf.flaglock);

               /* Sleep until the target moves away from lastseen. */
               for (;;) {
                       if (balloon_get_target() != lastseen) {
                               break;
                       }
                       cv_timedwait(&balloon_conf.cv_memchanged,
                                    &balloon_conf.flaglock, waitticks);
               }

               wanted = balloon_get_target();
               lastseen = balloon_resize(wanted);

               mutex_exit(&balloon_conf.flaglock);
       }
}

static size_t
xenbus_balloon_read_target(void)
{
       unsigned long long new_target;

       if (0 != xenbus_read_ull(NULL, "memory", "target", &new_target, 0)) {
               printf("error, couldn't read\n");
               return 0;
       }

       /* Convert to npages */

       return new_target * 1024 / PAGE_SIZE;
}

static void
xenbus_balloon_watcher(struct xenbus_watch *watch, const char **vec,
                      unsigned int len)
{
       unsigned long long new_target;

       if (0 == (new_target = xenbus_balloon_read_target())) {
               /* Don't update target value */
               return;
       }
       balloon_set_target(new_target);
       printf("Setting target to %llu\n", new_target);
       printf("Current reservation is %zu\n", xenmem_get_currentreservation());

       /* Notify balloon thread, if we can. */
       if (mutex_tryenter(&balloon_conf.flaglock)) {
               cv_signal(&balloon_conf.cv_memchanged);
               mutex_exit(&balloon_conf.flaglock);
       }

       return;
}

/*
 * Bring up the balloon driver (no-op unless XEN_BALLOON is defined).
 *
 * Initializes the locks, the condvar and the page list, registers the
 * "memory/target" watch and creates the balloon kernel thread.  If
 * either the watch or the thread cannot be set up, everything done so
 * far is undone and the function returns without a running driver.
 */
void
balloon_xenbus_setup(void)
{

#ifdef XEN_BALLOON

       /* Setup flaglocks, condvars et. al */
       mutex_init(&balloon_conf.flaglock, MUTEX_DEFAULT, IPL_NONE);
       mutex_init(&balloon_conf.tgtlock, MUTEX_DEFAULT, IPL_HIGH);
       cv_init(&balloon_conf.cv_memchanged, "ballooning");

       SLIST_INIT(&balloon_conf.balloon_page_entries);

       /* Setup xenbus node watch callback */
       if (register_xenbus_watch(&xenbus_balloon_watch)) {
               aprint_error("%s: unable to watch memory/target\n", __func__);
               /*
                * Stop here: going on would start the thread on
                * destroyed locks.
                */
               goto fail;
       }

       /* Setup kernel thread to asynchronously (in/de)-flate the balloon */
       if (kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, balloon_thread, NULL /* arg */,
                          NULL, "balloon")) {
               aprint_error("%s: unable to create balloon thread\n", __func__);
               unregister_xenbus_watch(&xenbus_balloon_watch);
               goto fail;
       }

       return;

fail:
       /* Tear down the synchronization primitives exactly once. */
       cv_destroy(&balloon_conf.cv_memchanged);
       mutex_destroy(&balloon_conf.tgtlock);
       mutex_destroy(&balloon_conf.flaglock);

#endif
       return;

}


/*
* sysctl(9) stuff
*/

/*
 * sysctl helper routine for the "current" and "target" nodes.
 *
 * "current": reports the current reservation as read from the
 *            hypervisor.
 * "target":  reports the stored balloon target; when a different value
 *            is written it becomes the new target.  Values <= 0 are
 *            rejected with EINVAL, since they would otherwise be
 *            converted to a size_t and installed as a nonsensical
 *            target.
 *
 * Returns 0 on success, the error from sysctl_lookup(), or EINVAL for
 * an unknown node name or an unacceptable target.
 */
static int
sysctl_kern_xen_balloon(SYSCTLFN_ARGS)
{

       struct sysctlnode node;

       /*
        * Assumes SIZE_T_MAX <= ((uint64_t) -1) see createv() in
        * SYSCTL_SETUP(...) below
        */

       int error;
       int64_t node_val;
       int64_t newnode_val;

       KASSERT(rnode != NULL);
       node = *rnode;

       if (strcmp(node.sysctl_name, "current") == 0) {
               node_val = xenmem_get_currentreservation();
               KASSERT(node_val < SIZE_T_MAX);
               node.sysctl_data = &node_val;
               return sysctl_lookup(SYSCTLFN_CALL(&node));

       } else if (strcmp(node.sysctl_name, "target") == 0) {

               /* Work on a copy so a written value can be told apart. */
               newnode_val = node_val = balloon_get_target();
               node.sysctl_data = &newnode_val;
               error = sysctl_lookup(SYSCTLFN_CALL(&node));
               if (error != 0) {
                       return error;
               }

               KASSERT(node_val < SIZE_T_MAX);

               if (node_val != newnode_val) {
                       /*
                        * Sanity check new size.
                        * XXX: LOW_BALLOON/HIGH_BALLOON are not
                        * enforced yet; only reject what can never be
                        * a valid page count.
                        */
                       if (newnode_val <= 0) {
                               return EINVAL;
                       }

                       printf("setting to %qd", newnode_val);
                       balloon_set_target(newnode_val);
               }

               return 0;
       }

       return EINVAL;
}

/*
 * Setup nodes: creates kern.xen.balloon with the leaves "current"
 * (read-only) and "target" (read-write), both quad-sized and served by
 * sysctl_kern_xen_balloon().
 *
 * If any of the parent nodes cannot be created, setup stops there
 * instead of attaching the remaining nodes to an unexpected parent.
 */
SYSCTL_SETUP(sysctl_kern_xen_balloon_setup, "sysctl kern.xen.balloon setup")
{
       const struct sysctlnode *node = NULL;
       int error;

       error = sysctl_createv(clog, 0, NULL, &node,
                      CTLFLAG_PERMANENT,
                      CTLTYPE_NODE, "kern", NULL,
                      NULL, 0, NULL, 0,
                      CTL_KERN, CTL_EOL);
       if (error != 0 || node == NULL) {
               return;
       }

       error = sysctl_createv(clog, 0, &node, &node,
                      CTLFLAG_PERMANENT,
                      CTLTYPE_NODE, "xen",
                      SYSCTL_DESCR("Xen"),
                      NULL, 0, NULL, 0,
                      CTL_CREATE, CTL_EOL);
       if (error != 0 || node == NULL) {
               return;
       }

       error = sysctl_createv(clog, 0, &node, &node,
                      CTLFLAG_PERMANENT,
                      CTLTYPE_NODE, "balloon",
                      SYSCTL_DESCR("Balloon"),
                      NULL, 0, NULL, 0,
                      CTL_CREATE, CTL_EOL);
       if (error != 0 || node == NULL) {
               return;
       }

       /* Leaves: a failure of one does not prevent creating the other. */
       sysctl_createv(clog, 0, &node, NULL,
                      CTLFLAG_PERMANENT,
                      CTLTYPE_QUAD, "current",
                      SYSCTL_DESCR("Current balloon size"),
                      sysctl_kern_xen_balloon, 0, NULL, 0,
                      CTL_CREATE, CTL_EOL);

       sysctl_createv(clog, 0, &node, NULL,
                      CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
                      CTLTYPE_QUAD, "target",
                      SYSCTL_DESCR("Target balloon size"),
                      sysctl_kern_xen_balloon, 0, NULL, 0,
                      CTL_CREATE, CTL_EOL);

}