From: Jachym Holecek <[email protected]>

Index: Makefile
===================================================================
RCS file: /cvsroot/src/sbin/init/Makefile,v
retrieving revision 1.35
diff -u -p -r1.35 Makefile
--- Makefile    13 Jan 2005 03:21:59 -0000      1.35
+++ Makefile    9 Sep 2005 12:44:29 -0000
@@ -7,10 +7,10 @@ DPADD=        ${LIBUTIL}
LDADD= -lutil
CPPFLAGS+=     -DMFS_DEV_IF_NO_CONSOLE -DSUPPORT_UTMP -DSUPPORT_UTMPX

-.ifdef SMALLPROG
+.if defined(SMALLPROG) && !defined(CHROOT_INIT)
CPPFLAGS+=     -DLETS_GET_SMALL
.else
-CPPFLAGS+=     -DALTSHELL -DSECURE
+CPPFLAGS+=     -DALTSHELL -DSECURE -DCHROOT
DPADD+=                ${LIBCRYPT}
LDADD+=                -lcrypt
.endif
Index: NOTES
===================================================================
RCS file: /cvsroot/src/sbin/init/NOTES,v
retrieving revision 1.2
diff -u -p -r1.2 NOTES
--- NOTES       18 Mar 1995 14:56:29 -0000      1.2
+++ NOTES       9 Sep 2005 12:44:29 -0000
@@ -101,8 +101,12 @@ init is responsible for utmp and wtmp ma

necessary states and state transitions (gleaned from the man page):
       1: single user shell (with password checking?); on exit, go to 2
-       2: rc script: on exit 0, go to 3; on exit N (error), go to 1
-       3: read ttys file: on completion, go to 4
+       2: run rc script, if init.root sysctl != "/", fork + chroot(init.root)
+          and run /etc/rc: on exit 0, go to 3; on exit N (error), go to 1
+          (applies to both /etc/rc and potentially chroot:/etc/rc)
+       3: read ttys file: on completion, go to 4. if we did chroot in
+          state 2, we chroot after forking each getty to the same dir
+          (init.root is not re-read)
       4: multi-user operation: on SIGTERM, go to 7; on SIGHUP, go to 5;
               on SIGTSTP, go to 6
       5: clean up mode (re-read ttys file, killing off controlling processes
Index: init.8
===================================================================
RCS file: /cvsroot/src/sbin/init/init.8,v
retrieving revision 1.36
diff -u -p -r1.36 init.8
--- init.8      1 Jul 2005 17:07:33 -0000       1.36
+++ init.8      9 Sep 2005 12:44:29 -0000
@@ -1,4 +1,4 @@
-.\"    $NetBSD: init.8,v 1.36 2005/07/01 17:07:33 wiz Exp $
+.\"    $NetBSD: init.8,v 1.32 2005/04/01 20:40:17 abs Exp $
.\"
.\" Copyright (c) 1980, 1991, 1993
.\"    The Regents of the University of California.  All rights reserved.
@@ -32,7 +32,7 @@
.\"
.\"     @(#)init.8     8.6 (Berkeley) 5/26/95
.\"
-.Dd June 30, 2005
+.Dd April 1, 2005
.Dt INIT 8
.Os
.Sh NAME
@@ -78,11 +78,28 @@ exits with a non-zero (error) exit code,
operation by giving the super-user a shell on the console by going
to state 1 (single user).
Otherwise, proceed to state 3.
+.Pp
+If the value of the
+.Dq init.root
+sysctl node is not equal to
+.Pa /
+at this point, the
+.Pa /etc/rc
+process will be run inside a
+.Xr chroot 2
+indicated by sysctl with the same error handling as above.
.It
Set up ttys as specified in
.Xr ttys 5 .
See below for more information.
On completion, continue to state 4.
+If we did chroot in state 2, each
+.Xr getty 8
+process will be run in the same
+.Xr chroot 2
+path as in 2 (that is, the value of
+.Dq init.root
+sysctl is not re-read).
.It
Multi-user operation.
Depending upon the signal received, change state appropriately;
@@ -130,6 +147,26 @@ The password check is skipped if the
is marked as
.Dq secure .
.Pp
+It should be noted that while
+.Nm
+has the ability to start multi-user operation inside a
+.Xr chroot 2
+environment, the
+.Nm
+process itself will always run in the
+.Dq original root directory .
+This also implies that single-user mode is always started in the original
+root, giving the possibility to create multi-user sessions in different
+root directories over time. The
+.Dq init.root
+sysctl node is fabricated by
+.Nm
+at startup and re-created any time it is found to be missing. The node
+is a string capable of holding a full pathname, and is only accessible by
+the superuser (unless explicitly destroyed and re-created with a different
+specification). The node becomes read-only after securelevel 1 has been
+reached.
+.Pp
The kernel runs with four different levels of security.
Any superuser process can raise the security level, but only
.Nm
@@ -221,9 +258,9 @@ in the kernel configuration file, which
.Va securelevel
variable to -1.
See
-.Xr config 1
-and
.Xr options 4
+and
+.Xr config 8
for details.
.Pp
In multi-user operation,
@@ -337,32 +374,6 @@ If, at bootstrap time, the
.Nm
process cannot be located, the system will panic with the message
.Dq panic: init died (signal %d, exit %d) .
-.Pp
-If
-.Pa /dev/console
-does not exist,
-.Nm
-will create a MFS (memory based file system) mounted over
-.Pa /dev .
-Then it will create a
-.Pa /dev/console
-device so you can see things happening.
-The
-.Xr MAKEDEV 8
-and
-.Pa MAKEDEV.local
-scripts are placed in the new
-.Pa /dev
-directory.
-Then
-.Nm
-changes the working directory to
-.Pa /dev
-and runs the scripts using the
-.Dq init
-special target.
-This creates the standard devices considered necessary to boot the
-system.
.Sh FILES
.Bl -tag -width /var/log/wtmp -compact
.It Pa /dev/console
@@ -395,19 +406,18 @@ This condition is usually caused by a pr
device driver because of a persistent device error condition.
.El
.Sh SEE ALSO
-.Xr config 1 ,
.Xr kill 1 ,
.Xr login 1 ,
.Xr sh 1 ,
.Xr options 4 ,
.Xr ttys 5 ,
-.Xr MAKEDEV 8 ,
+.Xr config 8 ,
.Xr getty 8 ,
.Xr halt 8 ,
-.Xr mfs 8 ,
.Xr rc 8 ,
.Xr reboot 8 ,
-.Xr shutdown 8
+.Xr shutdown 8 ,
+.Xr sysctl 8
.Sh HISTORY
A
.Nm
Index: init.c
===================================================================
RCS file: /cvsroot/src/sbin/init/init.c,v
retrieving revision 1.70
diff -u -p -r1.70 init.c
--- init.c      27 Jun 2005 01:00:05 -0000      1.70
+++ init.c      9 Sep 2005 12:44:30 -0000
@@ -135,12 +135,6 @@ state_func_t death(void);
enum { AUTOBOOT, FASTBOOT } runcom_mode = AUTOBOOT;

void transition(state_t);
-#ifndef LETS_GET_SMALL
-state_t requested_transition = runcom;
-#else /* LETS_GET_SMALL */
-state_t requested_transition = single_user;
-#endif /* LETS_GET_SMALL */
-
void setctty(const char *);

typedef struct init_session {
@@ -174,16 +168,31 @@ int getsecuritylevel(void);
int setupargv(session_t *, struct ttyent *);
int clang;

-#ifndef LETS_GET_SMALL
-void clear_session_logs(session_t *, int);
-#endif
-
int start_session_db(void);
void add_session(session_t *);
void del_session(session_t *);
session_t *find_session(pid_t);
DB *session_db;

+int do_setttyent(void);
+
+#ifndef LETS_GET_SMALL
+state_t requested_transition = runcom;
+
+void clear_session_logs(session_t *, int);
+state_func_t runetcrc(int);
+
+#ifdef CHROOT
+int did_multiuser_chroot = 0;
+char rootdir[PATH_MAX];
+int shouldchroot(void);
+int createsysctlnode(void);
+#endif /* CHROOT */
+
+#else /* LETS_GET_SMALL */
+state_t requested_transition = single_user;
+#endif /* !LETS_GET_SMALL */
+
#ifdef MFS_DEV_IF_NO_CONSOLE

#define NINODE 1024
@@ -309,6 +318,11 @@ main(int argc, char **argv)
       (void)close(1);
       (void)close(2);

+#if !defined(LETS_GET_SMALL) && defined(CHROOT)
+       /* Create "init.root" sysctl node. */
+       createsysctlnode();
+#endif /* !LETS_GET_SMALL && CHROOT*/
+
       /*
        * Start the state machine.
        */
@@ -392,6 +406,29 @@ warning(const char *message, ...)
       vsyslog(LOG_ALERT, message, ap);
       va_end(ap);
       closelog();
+
+#if 0
+       /*
+        * XXX: Syslog seems to just plain not work in console-only
+        * XXX: situation... that should be fixed. Let's leave this
+        * XXX: note + code here in case someone gets in trouble and
+        * XXX: wants to debug. -- jachym
+        */
+       {
+               char            errbuf[1024];
+               int             fd, len;
+
+               /* We can't do anything on errors, anyway... */
+               fd = open(_PATH_CONSOLE, O_WRONLY);
+               if (fd == -1)
+                       return ;
+
+               /* %m will get lost... */
+               len = vsnprintf(errbuf, sizeof(errbuf), message, ap);
+               (void)write(fd, (void *)errbuf, len);
+               (void)close(fd);
+       }
+#endif
}

/*
@@ -566,6 +603,11 @@ single_user(void)
       char altshell[128];
#endif /* ALTSHELL */

+#if !defined(LETS_GET_SMALL) && defined(CHROOT)
+       /* Clear previous idea, just in case. */
+       did_multiuser_chroot = 0;
+#endif /* !LETS_GET_SMALL && CHROOT */
+
       /*
        * If the kernel is in secure mode, downgrade it to insecure mode.
        */
@@ -722,11 +764,10 @@ single_user(void)
}

#ifndef LETS_GET_SMALL
-/*
- * Run the system startup script.
- */
+
+/* ARGSUSED */
state_func_t
-runcom(void)
+runetcrc(int trychroot)
{
       pid_t pid, wpid;
       int status;
@@ -745,11 +786,20 @@ runcom(void)

               argv[0] = "sh";
               argv[1] = _PATH_RUNCOM;
-               argv[2] = runcom_mode == AUTOBOOT ? "autoboot" : 0;
+               argv[2] = (runcom_mode == AUTOBOOT ? "autoboot" : 0);
               argv[3] = 0;

               (void)sigprocmask(SIG_SETMASK, &sa.sa_mask, NULL);

+#ifdef CHROOT
+               if (trychroot)
+                       if (chroot(rootdir) != 0) {
+                               warning("failed to chroot to %s, error: %m",
+                                   rootdir);
+                               _exit(1);       /* force single user mode */
+                       }
+#endif /* CHROOT */
+
               (void)execv(INIT_BSHELL, __UNCONST(argv));
               stall("can't exec %s for %s: %m", INIT_BSHELL, _PATH_RUNCOM);
               _exit(1);       /* force single user mode */
@@ -805,6 +855,44 @@ runcom(void)
       if (WEXITSTATUS(status))
               return (state_func_t)single_user;

+       return (state_func_t) read_ttys;
+}
+
+/*
+ * Run the system startup script.
+ */
+state_func_t
+runcom(void)
+{
+       state_func_t next_step;
+
+       /* Run /etc/rc and choose next state depending on result. */
+       next_step = runetcrc(0);
+       if (next_step != (state_func_t) read_ttys)
+               return (state_func_t) next_step;
+
+#ifdef CHROOT
+       /*
+        * If init.root sysctl does not point to "/", we'll chroot and
+        * run "the real" /etc/rc now. Global variable rootdir will tell
+        * us where to go.
+        */
+       if (shouldchroot()) {
+               next_step = runetcrc(1);
+               if (next_step != (state_func_t) read_ttys)
+                       return (state_func_t) next_step;
+
+               did_multiuser_chroot = 1;
+       } else {
+               did_multiuser_chroot = 0;
+       }
+#endif /* CHROOT */
+
+       /*
+        * Regardless of whether in chroot or not, we booted successfully.
+        * It's time to spawn gettys (ie. next_step's value at this point).
+        * Note that /etc/ttys will _not_ be read from under chroot.
+        */
       runcom_mode = AUTOBOOT;         /* the default */
       /* NB: should send a message to the session logger to avoid blocking. */
#ifdef SUPPORT_UTMPX
@@ -1029,8 +1117,19 @@ read_ttys(void)
               free_session(sp);
       }
       sessions = NULL;
-       if (start_session_db())
-               return (state_func_t)single_user;
+
+       if (start_session_db()) {
+               warning("read_ttys: start_session_db failed, death\n");
+               /* If /etc/rc ran under chroot, we want to kill survivors. */
+#ifdef CHROOT
+               if (did_multiuser_chroot)
+                       return (state_func_t)death;
+               else
+#endif /* CHROOT */
+                       return (state_func_t)single_user;
+       }
+
+       do_setttyent();

       /*
        * Allocate a session entry for each active port.
@@ -1039,7 +1138,6 @@ read_ttys(void)
       while ((typ = getttyent()) != NULL)
               if ((snext = new_session(sp, ++session_index, typ)) != NULL)
                       sp = snext;
-
       endttyent();

       return (state_func_t)multi_user;
@@ -1097,6 +1195,16 @@ start_getty(session_t *sp)
       if (pid)
               return pid;

+#ifdef CHROOT
+       /* If /etc/rc did proceed inside chroot, we have to try as well. */
+       if (did_multiuser_chroot)
+               if (chroot(rootdir) != 0) {
+                       stall("can't chroot getty '%s' inside %s: %m",
+                           sp->se_getty_argv[0], rootdir);
+                       _exit(1);
+               }
+#endif /* CHROOT */
+
       if (current_time > sp->se_started &&
           current_time - sp->se_started < GETTY_SPACING) {
               warning("getty repeating too quickly on port %s, sleeping",
@@ -1244,6 +1352,8 @@ clean_ttys(void)
       for (sp = sessions; sp; sp = sp->se_next)
               sp->se_flags &= ~SE_PRESENT;

+       do_setttyent();
+
       devlen = sizeof(_PATH_DEV) - 1;
       while ((typ = getttyent()) != NULL) {
               ++session_index;
@@ -1492,6 +1602,7 @@ mfs_dev(void)
                           mfile[0].len ? "./MAKEDEV" : "/etc/MAKEDEV",
                           "init", NULL);
               _exit(1);
+               /* NOTREACHED */

       case -1:
               break;
@@ -1509,3 +1620,113 @@ mfs_dev(void)
       return (-1);
}
#endif
+
+int
+do_setttyent(void)
+{
+       /* Reopen ttys; if chrooted, presumably rootdir's via setttyent1. */
+       endttyent();
+#ifdef CHROOT
+       if (did_multiuser_chroot != 0) {
+               char                    ttys[PATH_MAX];
+
+               snprintf(ttys, sizeof(ttys), "%s/%s", rootdir, _PATH_TTYS);
+               return setttyent1(ttys);
+       }
+#endif /* CHROOT */
+       return setttyent();
+}
+
+#if !defined(LETS_GET_SMALL) && defined(CHROOT)
+
+/*
+ * Create sysctl nodes "init" and "init.root" (default value "/").
+ * Returns 0 on success, -1 if either sysctl(3) call fails.
+ */
+int
+createsysctlnode(void)
+{
+       struct sysctlnode               node;
+       int                             mib[2];
+       size_t                          len;
+
+       /*
+        * Create toplevel dynamic node: superuser-only children
+        * (CTLFLAG_PRIVATE), read-only from securelevel 1 (CTLFLAG_READONLY1).
+        */
+       len = sizeof(node);
+       mib[0] = CTL_CREATE;
+       memset(&node, 0, sizeof(node));
+       node.sysctl_flags = SYSCTL_VERSION | CTLFLAG_READWRITE |
+           CTLFLAG_PRIVATE | CTLFLAG_READONLY1 | CTLTYPE_NODE;
+       node.sysctl_num = CTL_CREATE;
+       snprintf(node.sysctl_name, SYSCTL_NAMELEN, "init");
+       /* node doubles as output: its sysctl_num is reused as mib[0] below. */
+       if (sysctl(&mib[0], 1, &node, &len, &node, len) == -1) {
+               warning("could not create init node, error = %d", errno);
+               return (-1);
+       }
+
+       /*
+        * Create second level dynamic node capable of holding pathname.
+        * Provide / as default value.
+        */
+       len = sizeof(node);
+       mib[0] = node.sysctl_num;
+       mib[1] = CTL_CREATE;
+       memset(&node, 0, sizeof(node));
+       node.sysctl_flags = SYSCTL_VERSION | CTLFLAG_READWRITE |
+           CTLFLAG_READONLY1 | CTLTYPE_STRING | CTLFLAG_OWNDATA;
+       node.sysctl_size = _POSIX_PATH_MAX;
+       node.sysctl_data = __UNCONST("/");
+       node.sysctl_num = CTL_CREATE;
+       snprintf(node.sysctl_name, SYSCTL_NAMELEN, "root");
+       if (sysctl(&mib[0], 2, NULL, NULL, &node, len) == -1) {
+               warning("could not create init.root node, error = %d", errno);
+               return (-1);
+       }
+       return (0);
+}
+
+/*
+ * Read init.root into the global rootdir.  Returns 1 if it holds a
+ * string other than "/" (caller should chroot there), 0 otherwise.
+ */
+int
+shouldchroot(void)
+{
+       struct sysctlnode               node;
+       size_t                          len, cnt;
+       int                             mib, error;
+
+       len = sizeof(rootdir);          /* in: buffer size, out: data size */
+       if (sysctlbyname("init.root", rootdir, &len, NULL, 0) == -1) {
+               error = errno;          /* warning() may clobber errno */
+               warning("could not read init.root, error = %d", error);
+               cnt = 1;                /* mib holds a single component */
+               /* Node is gone: destroy whatever is left, recreate it. */
+               if (error == ENOENT) {
+                       if (sysctlnametomib("init", &mib, &cnt) != -1) {
+                               memset(&node, 0, sizeof(node));
+                               node.sysctl_flags = SYSCTL_VERSION;
+                               node.sysctl_num = mib;
+                               mib = CTL_DESTROY;
+                               (void)sysctl(&mib, 1, NULL, NULL, &node,
+                                   sizeof(node));
+                       }
+                       createsysctlnode();
+               }
+               return (0);             /* We certainly won't chroot. */
+       }
+
+       /* len counts the NUL, so check rootdir[len - 1], not stale [len]. */
+       if (len == 0 || rootdir[len - 1] != '\0' ||
+           strlen(rootdir) != len - 1) {
+               warning("init.root is not a string");
+               return (0);
+       }
+
+       return (strcmp(rootdir, "/") != 0);
+}
+
+#endif /* !LETS_GET_SMALL && CHROOT */