/* proc is free now, make sure unlock() won't touch it */
up = procalloc.Lock.p = nil;
unlock(&procalloc);
sched();
}
coherence();
up->mach = nil;
up = nil;
}
sched();
}
/*
* If changing this routine, look also at sleep(). It
* contains a copy of the guts of sched().
*/
void
sched(void)
{
Proc *p;
if(m->ilockdepth)
panic("cpu%d: ilockdepth %d, last lock %#p at %#p, sched called from %#p",
m->machno,
m->ilockdepth,
up != nil ? up->lastilock: nil,
(up != nil && up->lastilock != nil) ? up->lastilock->pc: 0,
getcallerpc(&p+2));
if(up != nil) {
/*
* Delay the sched until the process gives up the locks
* it is holding. This avoids dumb lock loops.
* Don't delay if the process is Moribund.
* It called sched to die.
* But do sched eventually. This avoids a missing unlock
* from hanging the entire kernel.
* But don't reschedule procs holding palloc or procalloc.
* Those are far too important to be holding while asleep.
*
* This test is not exact. There can still be a few instructions
* in the middle of taslock when a process holds a lock
* but Lock.p has not yet been initialized.
*/
if(up->nlocks)
if(up->state != Moribund)
if(up->delaysched < 20
|| palloc.Lock.p == up
|| fscache.Lock.p == up
|| procalloc.Lock.p == up){
up->delaysched++;
delayedscheds++;
return;
}
up->delaysched = 0;
int
anyhigher(void)
{
return runvec & ~((1<<(up->priority+1))-1);
}
/*
* here once per clock tick to see if we should resched
*/
/*
 * here once per clock tick to see if we should resched
 */
void
hzsched(void)
{
	/* once a second, rebalance will reprioritize ready procs */
	if(m->machno == 0)
		rebalance();

	/* unless preempted, get to run for at least 100ms */
	if(!anyhigher()){
		if(up->fixedpri)
			return;
		if((long)(m->ticks - m->schedticks) <= 0)
			return;
		if(!anyready())
			return;
	}
	m->readied = nil;	/* avoid cooperative scheduling */
	up->delaysched++;
}
/*
* here at the end of non-clock interrupts to see if we should preempt the
* current process. Returns 1 if preempted, 0 otherwise.
*/
/*
 * here at the end of non-clock interrupts to see if we should preempt the
 * current process. Returns 1 if preempted, 0 otherwise.
 */
int
preempted(void)
{
	if(up == nil || up->state != Running)
		return 0;
	if(up->preempted)
		return 0;
	if(!anyhigher())
		return 0;
	if(active.exiting)
		return 0;

	m->readied = nil;	/* avoid cooperative scheduling */
	up->preempted = 1;
	sched();
	splhi();
	up->preempted = 0;
	return 1;
}
/*
* Update the cpu time average for this particular process,
* which is about to change from up -> not up or vice versa.
* p->lastupdate is the last time an updatecpu happened.
*
* The cpu time average is a decaying average that lasts
* about D clock ticks. D is chosen to be approximately
* the cpu time of a cpu-intensive "quick job". A job has to run
* for approximately D clock ticks before we home in on its
* actual cpu usage. Thus if you manage to get in and get out
* quickly, you won't be penalized during your burst. Once you
* start using your share of the cpu for more than about D
* clock ticks though, your p->cpu hits 1000 (1.0) and you end up
* below all the other quick jobs. Interactive tasks, because
* they basically always use less than their fair share of cpu,
* will be rewarded.
*
* If the process has not been running, then we want to
* apply the filter
*
* cpu = cpu * (D-1)/D
*
* n times, yielding
*
* cpu = cpu * ((D-1)/D)^n
*
* but D is big enough that this is approximately
*
* cpu = cpu * (D-n)/D
*
* so we use that instead.
*
* If the process has been running, we apply the filter to
* 1 - cpu, yielding a similar equation. Note that cpu is
* stored in fixed point (* 1000).
*
* Updatecpu must be called before changing up, in order
* to maintain accurate cpu usage statistics. It can be called
* at any time to bring the stats for a given proc up-to-date.
*/
/*
 * Fold the time since p->lastupdate into p's decaying cpu average
 * (fixed point, * 1000).  See the long comment above for the math;
 * the decay window is D = schedgain*HZ*Scaling ticks.
 * Must be called before up changes (see comment above).
 */
void
updatecpu(Proc *p)
{
	ulong now, elapsed, decay, ocpu, idle;

	/* edf-scheduled procs are not tracked this way */
	if(p->edf != nil)
		return;

	now = MACHP(0)->ticks*Scaling + Scaling/2;
	elapsed = now - p->lastupdate;
	if(elapsed == 0)
		return;
	p->lastupdate = now;

	decay = schedgain*HZ*Scaling;
	if(elapsed > decay)
		elapsed = decay;

	ocpu = p->cpu;
	if(p != up){
		/* not running: cpu = cpu * (D-n)/D */
		p->cpu = (ocpu*(decay-elapsed))/decay;
	}else{
		/* running: apply the same filter to 1 - cpu */
		idle = 1000 - ocpu;
		idle = (idle*(decay-elapsed))/decay;
		p->cpu = 1000 - idle;
	}
}
/*
* On average, p has used p->cpu of a cpu recently.
* Its fair share is conf.nmach/m->load of a cpu. If it has been getting
* too much, penalize it. If it has been getting not enough, reward it.
* I don't think you can get much more than your fair share that
* often, so most of the queues are for using less. Having a priority
* of 3 means you're just right. Having a higher priority (up to p->basepri)
* means you're not using as much as you could.
*/
/*
 * On average, p has used p->cpu of a cpu recently.
 * Its fair share is conf.nmach/m->load of a cpu. If it has been getting
 * too much, penalize it. If it has been getting not enough, reward it.
 * I don't think you can get much more than your fair share that
 * often, so most of the queues are for using less. Having a priority
 * of 3 means you're just right. Having a higher priority (up to p->basepri)
 * means you're not using as much as you could.
 *
 * Returns the new priority for p, clamped to p->basepri.
 */
int
reprioritize(Proc *p)
{
	int fairshare, n, load, ratio;

	/*
	 * BUG FIX: load was read uninitialized (and could divide by zero).
	 * Take the system load from cpu0's accounting (same source updatecpu
	 * uses for ticks); an idle system imposes no penalty, so give p its
	 * base priority.
	 */
	load = MACHP(0)->load;
	if(load == 0)
		return p->basepri;

	/*
	 * fairshare = 1.000 * conf.nmach * 1.000/load,
	 * except the decimal point is moved three places
	 * on both load and fairshare.
	 */
	fairshare = (conf.nmach*1000*1000)/load;
	n = p->cpu;
	if(n == 0)
		n = 1;
	/* round to nearest: ratio = fairshare/n */
	ratio = (fairshare+n/2) / n;
	if(ratio > p->basepri)
		ratio = p->basepri;
	if(ratio < 0)
		panic("reprioritize");
	return ratio;
}
/*
* add a process to a scheduling queue
*/
void
queueproc(Schedq *rq, Proc *p)
{
int pri;
/*
* try to remove a process from a scheduling queue (called splhi)
*/
Proc*
dequeueproc(Schedq *rq, Proc *tp)
{
Proc *l, *p;
if(!canlock(runq))
return nil;
/*
* the queue may have changed before we locked runq,
* refind the target process.
*/
l = nil;
for(p = rq->head; p != nil; p = p->rnext){
if(p == tp)
break;
l = p;
}
/*
* ready(p) picks a new priority for a process and sticks it in the
* runq for that priority.
*/
void
ready(Proc *p)
{
int s, pri;
Schedq *rq;
void (*pt)(Proc*, int, vlong);
/*
* yield the processor and drop our priority
*/
/*
 * yield the processor and drop our priority
 */
void
yield(void)
{
	if(!anyready())
		return;
	/* pretend we just used 1/2 tick */
	up->lastupdate -= Scaling/2;
	sched();
}
/*
* recalculate priorities once a second. We need to do this
* since priorities will otherwise only be recalculated when
* the running process blocks.
*/
ulong balancetime;
loop:
/*
* find a process that last ran on this processor (affinity),
* or one that can be moved to this processor.
*/
spllo();
for(i = 0;; i++){
/*
* find the highest priority target process that this
* processor can run given affinity constraints.
*
*/
for(rq = &runq[Nrq-1]; rq >= runq; rq--){
for(p = rq->head; p != nil; p = p->rnext){
if(p->mp == nil || p->mp == MACHP(m->machno)
|| (p->wired == nil && i > 0))
goto found;
}
}
/* waste time or halt the CPU */
idlehands();
/* remember how much time we're here */
now = perfticks();
m->perf.inidle += now-start;
start = now;
}
/*
* sleep if a condition is not true. Another process will
* awaken us after it sets the condition. When we awaken
* the condition may no longer be true.
*
* we lock both the process and the rendezvous to keep r->p
* and p->r synchronized.
*/
void
sleep(Rendez *r, int (*f)(void*), void *arg)
{
int s;
void (*pt)(Proc*, int, vlong);
s = splhi();
if(up->nlocks)
print("process %lud sleeps with %d locks held, last lock %#p locked at pc %#p, sleep called from %#p\n",
up->pid, up->nlocks, up->lastlock, up->lastlock->pc, getcallerpc(&r));
lock(r);
lock(&up->rlock);
if(r->p != nil){
print("double sleep called from %#p, %lud %lud\n", getcallerpc(&r), r->p->pid, up->pid);
dumpstack();
}
/*
* Wakeup only knows there may be something to do by testing
* r->p in order to get something to lock on.
* Flush that information out to memory in case the sleep is
* committed.
*/
r->p = up;
if((*f)(arg) || up->notepending){
/*
* if condition happened or a note is pending
* never mind
*/
r->p = nil;
unlock(&up->rlock);
unlock(r);
} else {
/*
* now we are committed to
* change state and call scheduler
*/
pt = proctrace;
if(pt != nil)
pt(up, SSleep, 0);
up->state = Wakeme;
up->r = r;
/* statistics */
m->cs++;
procsave(up);
if(setlabel(&up->sched)) {
/*
* here when the process is awakened
*/
procrestore(up);
spllo();
} else {
/*
* here to go to sleep (i.e. stop Running)
*/
unlock(&up->rlock);
unlock(r);
gotolabel(&m->sched);
}
}
/*
* Expects that only one process can call wakeup for any given Rendez.
* We hold both locks to ensure that r->p and p->r remain consistent.
* Richard Miller has a better solution that doesn't require both to
* be held simultaneously, but I'm a paranoid - presotto.
*/
Proc*
wakeup(Rendez *r)
{
Proc *p;
int s;
/*
* if waking a sleeping process, this routine must hold both
* p->rlock and r->lock. However, it can't know them in
* the same order as wakeup causing a possible lock ordering
* deadlock. We break the deadlock by giving up the p->rlock
* lock if we can't get the r->lock and retrying.
*/
int
postnote(Proc *p, int dolock, char *n, int flag)
{
int s, ret;
QLock *q;
lock(&p->exl);
/*
* Check that parent is still alive.
*/
if(p->pid == up->parentpid && p->state != Broken) {
p->nchild--;
p->time[TCUser] += utime;
p->time[TCSys] += stime;
/*
* If there would be more than 128 wait records
 * pending for my parent, then don't leave a wait
* record behind. This helps prevent badly written
* daemon processes from accumulating lots of wait
* records.
*/
if(p->nwait < 128) {
wq->next = p->waitq;
p->waitq = wq;
p->nwait++;
wq = nil;
wakeup(&p->waitr);
}
}
unlock(&p->exl);
if(wq != nil)
free(wq);
}
if(!freemem)
addbroken(up);
qlock(&up->debug);
lock(&up->exl); /* Prevent my children from leaving waits */
pidfree(up);
up->parent = nil;
up->nchild = up->nwait = 0;
wakeup(&up->waitr);
unlock(&up->exl);
s = p->psstate;
if(s == nil)
s = statename[p->state];
print("%3lud:%10s pc %#p dbgpc %#p %8s (%s) ut %ld st %ld bss %lux qpc %#p nl %d nd %lud lpc %#p pri %lud\n",
p->pid, p->text, p->pc, dbgpc(p), s, statename[p->state],
p->time[0], p->time[1], bss, p->qpc, p->nlocks, p->delaysched,
p->lastlock ? p->lastlock->pc : 0, p->priority);
}
/*
* wait till all matching processes have flushed their mmu
*/
static void
procflushmmu(int (*match)(Proc*, void*), void *a)
{
Proc *await[MAXMACH];
int i, nm, nwait;
Proc *p;
/*
* wait for all other processors to take a clock interrupt
* and flush their mmu's
*/
for(;;){
if(nwait == 0 || nwait == 1 && await[m->machno] != nil)
break;
/* only one processor gets to compute system load averages */
if(m->machno != 0)
return;
/*
* calculate decaying load average.
* if we decay by (n-1)/n then it takes
* n clock ticks to go from load L to .36 L once
* things quiet down. it takes about 5 n clock
* ticks to go to zero. so using HZ means this is
* approximately the load over the last second,
* with a tail lasting about 5 seconds.
*/
n = nrun;
nrun = 0;
n = (nrdy+n)*1000*100;
load = ((uvlong)load*(HZ-1)+n)/HZ;
m->load = load/100;
}
/*
* A Pid structure is a reference counted hashtable entry
* with "pid" being the key and "procindex" being the value.
* A entry is allocated atomically by changing the key from
* negative or zero to the positive process id number.
 * Pid's outlive their Proc's as long as other processes hold
* a reference to them such as noteid or parentpid.
* This prevents pid reuse when the pid generator wraps.
*/
typedef struct Pid Pid;
struct Pid
{
	Ref;		/* reference count; slot is recycled when it reaches zero (see piddel) */
	long pid;	/* hash key: process id; a value <= 0 marks a free slot */
	int procindex;	/* hash value: index into the process table (see procindex/proctab) */
};
enum {
	PIDMASK = 0x7FFFFFFF,	/* NOTE(review): presumably keeps generated pids positive 31-bit — confirm at the pid generator */
	PIDSHIFT = 4,	/* log2 bucket size of the hash table */
};
static Pid *pidhashtab;
static ulong pidhashmask;
static void
pidinit(void)
{
/*
* allocate 3 times conf.nproc Pid structures for the hash table
* and round up to a power of two as each process can reference
* up to 3 unique Pid structures:
* - pid
* - noteid
* - parentpid
*/
pidhashmask = 1<<PIDSHIFT;
while(pidhashmask < conf.nproc*3)
pidhashmask <<= 1;
i = &pidhashtab[(pid<<PIDSHIFT) & pidhashmask];
for(e = &i[1<<PIDSHIFT]; i < e; i++){
while((o = i->pid) <= 0){
if(cmpswap(&i->pid, o, pid)){
incref(i);
return i;
}
}
}
/* bucket full, try a different pid */
goto Again;
}
/*
* decrement reference count of a pid and free it
* when no references are remaining
*/
/*
 * decrement reference count of a pid and free it
 * when no references are remaining
 */
static void
piddel(Pid *i)
{
	if(decref(i) == 0)
		i->pid = -1;	/* freed */
}
/*
 * Map a pid to its process table index, or -1 if no live
 * process currently has that pid (the slot may have been
 * reused, so the proctab entry is cross-checked).
 */
int
procindex(ulong pid)
{
	Pid *i;
	int x;

	i = pidlookup(pid);
	if(i == nil)
		return -1;
	x = i->procindex;
	if(proctab(x)->pid != pid)
		return -1;
	return x;
}