runc 是一个容器运行时,目前作为独立的项目发展。runc 提供了一套简单的容器运行环境,包括进程的命名空间、cgroups 和文件系统权限等管理功能。runc 是基于 OCI 标准的实现,使大家可以通过统一的接口进行运行时操作。其底层的管理工作主要依赖 clone、unshare 和 setns 等几个重要的系统调用。
执行 runc run 命令时,运行的是 run.go 文件中定义的 runCommand。
var runCommand = cli.Command{
Name: "run",
Usage: "create and run a container",
ArgsUsage: `<container-id>
Where "<container-id>" is your name for the instance of the container that you
are starting. The name you provide for the container instance must be unique on
your host.`,
Description: `The run command creates an instance of a container for a bundle. The bundle
is a directory with a specification file named "` + specConfig + `" and a root
filesystem.
The specification file includes an args parameter. The args parameter is used
to specify command(s) that get run when the container is started. To change the
command(s) that get executed on start, edit the args parameter of the spec. See
"runc spec --help" for more explanation.`,
Flags: []cli.Flag{
cli.StringFlag{
Name: "bundle, b",
Value: "",
Usage: `path to the root of the bundle directory, defaults to the current directory`,
cli.StringFlag{
Name: "console-socket",
Value: "",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
cli.BoolFlag{
Name: "detach, d",
Usage: "detach from the container's process",
cli.StringFlag{
Name: "pid-file",
Value: "",
Usage: "specify the file to write the process id to",
cli.BoolFlag{
Name: "no-subreaper",
Usage: "disable the use of the subreaper used to reap reparented processes",
cli.BoolFlag{
Name: "no-pivot",
Usage: "do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk",
cli.BoolFlag{
Name: "no-new-keyring",
Usage: "do not create a new session keyring for the container. This will cause the container to inherit the calling processes session key",
cli.IntFlag{
Name: "preserve-fds",
Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
Action: func(context *cli.Context) error {
if err := checkArgs(context, 1, exactArgs); err != nil {
return err
if err := revisePidFile(context); err != nil {
return err
spec, err := setupSpec(context)
if err != nil {
return err
status, err := startContainer(context, spec, CT_ACT_RUN, nil)
if err == nil {
os.Exit(status)
return err
从流程上看,主要内容是先通过 setupSpec 得到运行所需的配置(spec),然后调用 startContainer 函数,该函数负责整个容器运行时的启动与管理。
func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
id := context.Args().First()
if id == "" {
return -1, errEmptyID
notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id)
if notifySocket != nil {
if err := notifySocket.setupSpec(context, spec); err != nil {
return -1, err
container, err := createContainer(context, id, spec)
if err != nil {
return -1, err
if notifySocket != nil {
if err := notifySocket.setupSocketDirectory(); err != nil {
return -1, err
if action == CT_ACT_RUN {
if err := notifySocket.bindSocket(); err != nil {
return -1, err
listenFDs := []*os.File{}
if os.Getenv("LISTEN_FDS") != "" {
listenFDs = activation.Files(false)
logLevel := "info"
if context.GlobalBool("debug") {
logLevel = "debug"
r := &runner{
enableSubreaper: !context.Bool("no-subreaper"),
shouldDestroy: true,
container: container,
listenFDs: listenFDs,
notifySocket: notifySocket,
consoleSocket: context.String("console-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
preserveFDs: context.Int("preserve-fds"),
action: action,
criuOpts: criuOpts,
init: true,
logLevel: logLevel,
return
r.run(spec.Process)
此时通过 runner 的 run 方法继续后续工作。在此之前,先来看一下 createContainer 函数。
func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
rootlessCg, err := shouldUseRootlessCgroupManager(context)
if err != nil {
return nil, err
config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
CgroupName: id,
UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
NoPivotRoot: context.Bool("no-pivot"),
NoNewKeyring: context.Bool("no-new-keyring"),
Spec: spec,
RootlessEUID: os.Geteuid() != 0,
RootlessCgroups: rootlessCg,
})
if err != nil {
return nil, err
factory, err := loadFactory(context)
if err != nil {
return nil, err
return factory.Create(id, config)
接下来我们查看 loadFactory 函数,以及它最终调用的 libcontainer.New 函数。
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
if root != "" {
if err := os.MkdirAll(root, 0o700); err != nil {
return nil, newGenericError(err, SystemError)
l := &LinuxFactory{
Root: root,
InitPath: "/proc/self/exe",
InitArgs: []string{os.Args[0], "init"},
Validator: validate.New(),
CriuPath: "criu",
if err := Cgroupfs(l); err != nil {
return nil, err
for _, opt := range options {
if opt == nil {
continue
if err := opt(l); err != nil {
return nil, err
return l, nil
func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
root := context.GlobalString("root")
abs, err := filepath.Abs(root)
if err != nil {
return nil, err
cgroupManager := libcontainer.Cgroupfs
rootlessCg, err := shouldUseRootlessCgroupManager(context)
if err != nil {
return nil, err
if rootlessCg {
cgroupManager = libcontainer.RootlessCgroupfs
if context.GlobalBool("systemd-cgroup") {
if !systemd.IsRunningSystemd() {
return nil, errors.New("systemd cgroup flag passed, but systemd support for managing cgroups is not available")
cgroupManager = libcontainer.SystemdCgroups
if rootlessCg {
cgroupManager = libcontainer.RootlessSystemdCgroups
intelRdtManager := libcontainer.IntelRdtFs
newuidmap, err := exec.LookPath("newuidmap")
if err != nil {
newuidmap = ""
newgidmap, err := exec.LookPath("newgidmap")
if err != nil {
newgidmap = ""
return libcontainer.New(abs, cgroupManager, intelRdtManager,
libcontainer.CriuPath(context.GlobalString("criu")),
libcontainer.NewuidmapPath(newuidmap),
libcontainer.NewgidmapPath(newgidmap))
从函数流程可知,New 初始化了一个 LinuxFactory,其 InitPath 指向自身(/proc/self/exe),InitArgs 的参数为 init,即容器 init 进程的入口配置。接着我们查看 Create 方法的内容。
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
if l.Root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
if err := l.validateID(id); err !=
nil {
return nil, err
if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid)
containerRoot, err := securejoin.SecureJoin(l.Root, id)
if err != nil {
return nil, err
if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
} else if !os.IsNotExist(err) {
return nil, newGenericError(err, SystemError)
if err := os.MkdirAll(containerRoot, 0o711); err != nil {
return nil, newGenericError(err, SystemError)
if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {
return nil, newGenericError(err, SystemError)
c := &linuxContainer{
id: id,
root: containerRoot,
config: config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
newuidmapPath: l.NewuidmapPath,
newgidmapPath: l.NewgidmapPath,
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
}
if l.NewIntelRdtManager != nil {
c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
c.state = &stoppedState{c: c}
return c, nil
通过 createContainer 函数,完成了一系列文件权限与输入参数相关的前置工作。
现在我们查看一下 runner 的 run 方法是如何工作的。
func (r *runner) run(config *specs.Process) (int, error) {
var err error
defer func() {
if err != nil {
r.destroy()
}()
if err = r.checkTerminal(config); err != nil {
return -1, err
process, err := newProcess(*config, r.init, r.logLevel)
if err != nil {
return -1, err
if len(r.listenFDs) > 0 {
process.Env = append(process.Env, "LISTEN_FDS="+strconv.Itoa(len(r.listenFDs)), "LISTEN_PID=1")
process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
baseFd := 3 + len(process.ExtraFiles)
for i := baseFd; i < baseFd+r.preserveFDs; i++ {
_, err = os.Stat("/proc/self/fd/" + strconv.Itoa(i))
if err != nil {
return -1, errors.Wrapf(err, "please check that preserved-fd %d (of %d) is present", i-baseFd, r.preserveFDs)
process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
rootuid, err := r.container.Config().HostRootUID()
if err != nil {
return -1, err
rootgid, err := r.container.Config().HostRootGID()
if err != nil {
return -1, err
detach := r.detach || (r.action == CT_ACT_CREATE)
handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
tty, err := setupIO(process, rootuid, rootgid, config.
Terminal, detach, r.consoleSocket)
if err != nil {
return -1, err
defer tty.Close()
switch r.action {
case CT_ACT_CREATE:
err = r.container.Start(process)
case CT_ACT_RESTORE:
err = r.container.Restore(process, r.criuOpts)
case CT_ACT_RUN:
err = r.container.Run(process)
default:
panic("Unknown action")
if err != nil {
return -1, err
if err = tty.waitConsole(); err != nil {
r.terminate(process)
return -1, err
if err = tty.ClosePostStart(); err != nil {
r.terminate(process)
return -1, err
if r.pidFile != "" {
if err = createPidFile(r.pidFile, process); err != nil {
r.terminate(process)
return -1, err
status, err := handler.forward(process, tty, detach)
if err != nil {
r.terminate(process)
if detach {
return 0, nil
if err == nil {
r.destroy()
return status, err
重要的逻辑都放在 container 的 Run 方法中执行。
func (c *linuxContainer) Run(process *Process) error {
if err := c.Start(process); err != nil {
return err
if process.Init {
return c.exec()
return nil
func (c *linuxContainer) Start(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
if c.config.Cgroups.Resources.SkipDevices {
return newGenericError(errors.New("can't start container with SkipDevices set"), ConfigInvalid)
if process.Init {
if err := c.createExecFifo(); err != nil {
return err
if err := c.start(process); err != nil {
if process.Init {
c.deleteExecFifo()
return err
return nil
此时继续查看linuxContainer的start方法。
func (c *linuxContainer) start(process *Process) (retErr error) {
parent, err := c.newParentProcess(process)
if err != nil {
return newSystemErrorWithCause(err, "creating new parent process")
logsDone := parent.forwardChildLogs()
if logsDone != nil {
defer func() {
err := <-logsDone
if err != nil && retErr == nil {
retErr = newSystemErrorWithCause(err, "forwarding init logs")
}()
if err := parent.start(); err != nil {
return newSystemErrorWithCause(err, "starting container process")
if process.Init {
c.fifo.Close()
if c.config.Hooks != nil {
s, err := c.currentOCIState()
if err != nil {
return err
if err := c.config.Hooks[configs.Poststart].RunHooks(s); err != nil {
if err := ignoreTerminateErrors(parent.terminate()); err != nil {
logrus.Warn(errorsf.Wrapf(err, "Running Poststart hook"))
return err
return nil
首先通过 newParentProcess 创建一个父进程对象(parentProcess),再由它来完成启动。
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error
) {
parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new init pipe")
messageSockPair := filePair{parentInitPipe, childInitPipe}
parentLogPipe, childLogPipe, err := os.Pipe()
if err != nil {
return nil, fmt.Errorf("Unable to create the log pipe: %s", err)
logFilePair := filePair{parentLogPipe, childLogPipe}
cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
if !p.Init {
return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
if err := c.includeExecFifo(cmd); err != nil {
return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
cmd := exec.Command(c.initPath, c.initArgs[1:]...)
cmd.Args[0] = c.initArgs[0]
cmd.Stdin = p.Stdin
cmd.Stdout = p.Stdout
cmd.Stderr = p.Stderr
cmd.Dir = c.config.Rootfs
if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &unix.SysProcAttr{}
cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS"))
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
if p.ConsoleSocket != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
"_LIBCONTAINER_STATEDIR="+c.root,
cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
"_LIBCONTAINER_LOGLEVEL="+p.LogLevel,
if c.config.ParentDeathSignal > 0 {
cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
return cmd
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new init pipe")
messageSockPair := filePair{parentInitPipe, childInitPipe}
parentLogPipe, childLogPipe, err := os.Pipe()
if err != nil {
return nil, fmt.Errorf("Unable to create the log pipe: %s", err)
logFilePair := filePair{parentLogPipe, childLogPipe}
cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
if !p.Init {
return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
if err := c.includeExecFifo(cmd); err !=
nil {
return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
if ns.Path != "" {
nsMaps[ns.Type] = ns.Path
_, sharePidns := nsMaps[configs.NEWPID]
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
if err != nil {
return nil, err
init := &initProcess{
cmd: cmd,
messageSockPair: messageSockPair,
logFilePair: logFilePair,
manager: c.cgroupManager,
intelRdtManager: c.intelRdtManager,
config: c.newInitConfig(p),
container: c,
process: p,
bootstrapData: data,
sharePidns: sharePidns,
c.initProcess = init
return init, nil
初始化完成之后,就开始进入initProcess的start方法。
func (p *initProcess) start() (retErr error) {
defer p.messageSockPair.parent.Close()
err := p.cmd.Start()
p.process.ops = p
_ = p.messageSockPair.child.Close()
_ = p.logFilePair.child.Close()
if err != nil {
p.process.ops = nil
return newSystemErrorWithCause(err, "starting init process command")
waitInit := initWaiter(p.messageSockPair.parent)
defer func() {
if retErr != nil {
oom, err := p.manager.OOMKillCount()
if err != nil {
logrus.WithError(err).Warn("unable to get oom kill count")
} else if oom > 0 {
const oomError = "container init was OOM-killed (memory limit too low?)"
if logrus.GetLevel() >= logrus.DebugLevel {
retErr = newSystemErrorWithCause(retErr, oomError)
} else {
retErr = newSystemError(errors.New(oomError))
werr := <-waitInit
if werr != nil {
logrus.WithError(werr).Warn()
if err := ignoreTerminateErrors(p.terminate()); err != nil {
logrus.WithError(err).Warn("unable to terminate initProcess")
_ = p.manager.Destroy()
if p.intelRdtManager != nil {
_ = p.intelRdtManager.Destroy()
}()
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err,
"copying bootstrap data to pipe")
err = <-waitInit
if err != nil {
return err
childPid, err := p.getChildPid()
if err != nil {
return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
fds, err := getPipeFds(childPid)
if err != nil {
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
p.setExternalDescriptors(fds)
if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
if _, err := p.messageSockPair.parent.Write([]byte{createCgroupns}); err != nil {
return newSystemErrorWithCause(err, "sending synchronization value to init process")
if err := p.waitForChildExit(childPid); err != nil {
return newSystemErrorWithCause(err, "waiting for our first child to exit")
if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating network interfaces")
if err := p.updateSpecState(); err != nil {
return newSystemErrorWithCause(err, "updating the spec state")
if err := p.sendConfig(); err != nil {
return newSystemErrorWithCause(err, "sending config to init process")
var (
sentRun bool
sentResume bool
ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
switch sync.Type {
case procReady:
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for ready process")
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting Intel RDT config for ready process")
if p.config.Config.Hooks != nil {
s, err := p.container.currentOCIState()
if err != nil {
return err
s.Pid = p.cmd.Process.Pid
s.Status = specs.StateCreating
hooks := p.config.Config.Hooks
if err := hooks[configs.Prestart].RunHooks(s); err != nil {
return err
if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
return err
p.container.created = time.Now().UTC()
p.container.state = &createdState{
c: p.container,
state, uerr := p.container.updateState(p)
if uerr != nil {
return newSystemErrorWithCause(err, "store init state")
p.container.initProcessStartTime = state.InitProcessStartTime
if err := writeSync(p.messageSockPair.parent, procRun); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'run'")
sentRun = true
case procHooks:
if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")
if p.config.Config.Hooks != nil {
s, err := p.container.currentOCIState()
if err != nil {
return err
s.Pid = p.cmd.Process.Pid
s.Status = specs.StateCreating
hooks := p.config.Config.Hooks
if err := hooks[configs.Prestart].RunHooks(s); err != nil {
return err
if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
return err
if err := writeSync(p.messageSockPair.parent, procResume); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'resume'")
sentResume = true
default:
return newSystemError(errors.New("invalid JSON payload from child"))
return nil
if !sentRun {
return newSystemErrorWithCause(ierr, "container init")
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
return newSystemError(errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process"))
if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
return newSystemErrorWithCause(err, "shutting down init pipe")
if ierr != nil {
_, _ = p.wait()
return ierr
return nil
整个函数就是通过与子进程的多次同步通信来控制进程的创建。
此时子进程首先执行的是 runc init,其所在的包在导入列表中引入了 nsenter 包:
_ "github.com/opencontainers/runc/libcontainer/nsenter"
# 导入该包时,nsenter.go 中通过 cgo 声明的、带 constructor 属性的 C 函数 init 会在程序加载时自动执行,并调用 nsexec() 函数
// +build linux,!gccgo
package nsenter
#cgo CFLAGS: -Wall
extern void nsexec();
void __attribute__((constructor)) init(void) {
nsexec();
import "C"
此时,首先执行的就是 nsexec.c 文件中的 nsexec() 函数。该函数的大致逻辑是将创建流程拆分到两个子进程中分别进行,从而完成隔离环境的初始化。
void nsexec(void)
int pipenum;
jmp_buf env;
int sync_child_pipe[2], sync_grandchild_pipe[2];
struct nlconfig_t config = { 0 };
* Setup a pipe to send logs to the parent. This should happen
* first, because bail will use that pipe.
setup_logpipe();
* If we don't have an init pipe, just return to the go routine.
* We'll only get an init pipe for start or exec.
pipenum = initpipe();
if (pipenum == -1)
return;
* We need to re-exec if we are not in a cloned binary. This is necessary
* to ensure that containers won't be able to access the host binary
* through /proc/self/exe. See CVE-2019-5736.
if (ensure_cloned_binary() < 0)
bail("could not ensure we are a cloned binary");
* Inform the parent we're past initial setup.
* For the other side of this, see initWaiter.
if (write(pipenum, "", 1) != 1)
bail("could not inform the parent we are past initial setup");
write_log(DEBUG, "=> nsexec container setup");
nl_parse(pipenum, &config);