添加链接
link之家
链接快照平台
  • 输入网页链接,自动生成快照
  • 标签化管理网页链接

runc作为容器的运行时,现在作为独立的项目进行发展。runc提供一套简单的容器运行环境,包括进程的命名空间、cgroups和文件系统权限等管理功能。runc是基于OCI标准的产物,可以让大家通过统一的接口来进行运行时的操作。其本质的管理工作主要依赖clone、unshare和setns等几个重要的系统调用。

runc原理流程

runc作为运行时,即在提供了挂载目录、运行权限等运行参数的情况下将容器启动运行,真正地作为一个运行管理的工具使用;镜像、日志配置等交互与清理功能则完全交给上层应用(例如containerd等)来管理。

本文就粗略的剖析一下主要的流程原理。

runc的run命令

调用该命令的时候是执行位于run.go文件里面的command。

var runCommand = cli.Command{
	Name:  "run",
	Usage: "create and run a container",
	ArgsUsage: `<container-id>
Where "<container-id>" is your name for the instance of the container that you
are starting. The name you provide for the container instance must be unique on
your host.`,
	Description: `The run command creates an instance of a container for a bundle. The bundle
is a directory with a specification file named "` + specConfig + `" and a root
filesystem.
The specification file includes an args parameter. The args parameter is used
to specify command(s) that get run when the container is started. To change the
command(s) that get executed on start, edit the args parameter of the spec. See
"runc spec --help" for more explanation.`,
	Flags: []cli.Flag{
		cli.StringFlag{
			Name:  "bundle, b",
			Value: "",
			Usage: `path to the root of the bundle directory, defaults to the current directory`,
		cli.StringFlag{
			Name:  "console-socket",
			Value: "",
			Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
		cli.BoolFlag{
			Name:  "detach, d",
			Usage: "detach from the container's process",
		cli.StringFlag{
			Name:  "pid-file",
			Value: "",
			Usage: "specify the file to write the process id to",
		cli.BoolFlag{
			Name:  "no-subreaper",
			Usage: "disable the use of the subreaper used to reap reparented processes",
		cli.BoolFlag{
			Name:  "no-pivot",
			Usage: "do not use pivot root to jail process inside rootfs.  This should be used whenever the rootfs is on top of a ramdisk",
		cli.BoolFlag{
			Name:  "no-new-keyring",
			Usage: "do not create a new session keyring for the container.  This will cause the container to inherit the calling processes session key",
		cli.IntFlag{
			Name:  "preserve-fds",
			Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
	Action: func(context *cli.Context) error {
		if err := checkArgs(context, 1, exactArgs); err != nil {  // 检查输入参数
			return err
		if err := revisePidFile(context); err != nil {
			return err
		spec, err := setupSpec(context)     // 生成运行的配置文件
		if err != nil {
			return err
		status, err := startContainer(context, spec, CT_ACT_RUN, nil)  // 开启container,将RUN标志传入
		if err == nil {
			// exit with the container's exit status so any external supervisor is
			// notified of the exit with the correct exit status.
			os.Exit(status)
		return err

从流程上看,主要的内容就是先生成运行的配置文件,然后再执行startContainer函数,该函数实现了整个容器运行时的启动与管理功能。

func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
	id := context.Args().First()
	if id == "" {
		return -1, errEmptyID
	notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id)   // 生成通信的sock
	if notifySocket != nil {
		if err := notifySocket.setupSpec(context, spec); err != nil {
			return -1, err
	container, err := createContainer(context, id, spec)   // 创建container
	if err != nil {
		return -1, err
	if notifySocket != nil {
		if err := notifySocket.setupSocketDirectory(); err != nil {   // 建立socket的通信的目录文件
			return -1, err
		if action == CT_ACT_RUN {
			if err := notifySocket.bindSocket(); err != nil {   // 如果是运行的指令则监听该文件通信
				return -1, err
	// Support on-demand socket activation by passing file descriptors into the container init process.
	listenFDs := []*os.File{}
	if os.Getenv("LISTEN_FDS") != "" {
		listenFDs = activation.Files(false)
	logLevel := "info"
	if context.GlobalBool("debug") {    // 获取日志等级
		logLevel = "debug"
	r := &runner{
		enableSubreaper: !context.Bool("no-subreaper"),
		shouldDestroy:   true,
		container:       container,
		listenFDs:       listenFDs,
		notifySocket:    notifySocket,
		consoleSocket:   context.String("console-socket"),
		detach:          context.Bool("detach"),
		pidFile:         context.String("pid-file"),
		preserveFDs:     context.Int("preserve-fds"),
		action:          action,
		criuOpts:        criuOpts,
		init:            true,
		logLevel:        logLevel,
	return r.run(spec.Process)  // 将任务包装成runner运行

此时就是通过runner的run函数进行进一步工作。首先查看一下createContainer函数看看

func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
	rootlessCg, err := shouldUseRootlessCgroupManager(context)
	if err != nil {
		return nil, err
	config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
		CgroupName:       id,
		UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
		NoPivotRoot:      context.Bool("no-pivot"),
		NoNewKeyring:     context.Bool("no-new-keyring"),
		Spec:             spec,
		RootlessEUID:     os.Geteuid() != 0,
		RootlessCgroups:  rootlessCg,
	})    // 创建配置信息
	if err != nil {
		return nil, err
	factory, err := loadFactory(context)   // 加载一个创建的方法
	if err != nil {
		return nil, err
	return factory.Create(id, config)    // 通过id和配置文件创建一个container

此时我们查看一下loadFactory的方法。

// New returns a linux based container factory based in the root directory and
// configures the factory with the provided option funcs.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
	if root != "" {
		if err := os.MkdirAll(root, 0o700); err != nil {   // 改变根目录的权限
			return nil, newGenericError(err, SystemError)
	l := &LinuxFactory{
		Root:      root,
		InitPath:  "/proc/self/exe",   // 设置InitPath路径
		InitArgs:  []string{os.Args[0], "init"},  // 输入参数为init
		Validator: validate.New(),
		CriuPath:  "criu",
	if err := Cgroupfs(l); err != nil {  // 初始化cg
		return nil, err
	for _, opt := range options {  // 执行各种配置信息
		if opt == nil {
			continue
		if err := opt(l); err != nil {
			return nil, err
	return l, nil
// loadFactory returns the configured factory instance for execing containers.
func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
	root := context.GlobalString("root")
	abs, err := filepath.Abs(root)   // 获取路径
	if err != nil {
		return nil, err
	// We default to cgroupfs, and can only use systemd if the system is a
	// systemd box.
	cgroupManager := libcontainer.Cgroupfs
	rootlessCg, err := shouldUseRootlessCgroupManager(context)  // 选择对应的cg的信息
	if err != nil {
		return nil, err
	if rootlessCg {
		cgroupManager = libcontainer.RootlessCgroupfs
	if context.GlobalBool("systemd-cgroup") {
		if !systemd.IsRunningSystemd() {
			return nil, errors.New("systemd cgroup flag passed, but systemd support for managing cgroups is not available")
		cgroupManager = libcontainer.SystemdCgroups
		if rootlessCg {
			cgroupManager = libcontainer.RootlessSystemdCgroups
	intelRdtManager := libcontainer.IntelRdtFs   
	// We resolve the paths for {newuidmap,newgidmap} from the context of runc,
	// to avoid doing a path lookup in the nsexec context. TODO: The binary
	// names are not currently configurable.
	newuidmap, err := exec.LookPath("newuidmap")   // 获取当前的用户组相关信息
	if err != nil {
		newuidmap = ""
	newgidmap, err := exec.LookPath("newgidmap")
	if err != nil {
		newgidmap = ""
	return libcontainer.New(abs, cgroupManager, intelRdtManager,
		libcontainer.CriuPath(context.GlobalString("criu")),
		libcontainer.NewuidmapPath(newuidmap),
		libcontainer.NewgidmapPath(newgidmap))  // 生成一个container

从函数流程可知,初始化了一个path为自己,输入参数为init的入口配置信息,接着我们查看Create的内容。

func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
	if l.Root == "" {
		return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
	if err := l.validateID(id); err != nil {  // 检查id是否合法
		return nil, err
	if err := l.Validator.Validate(config); err != nil {
		return nil, newGenericError(err, ConfigInvalid)
	containerRoot, err := securejoin.SecureJoin(l.Root, id)  // 获取容器跟路径
	if err != nil {
		return nil, err
	if _, err := os.Stat(containerRoot); err == nil {   // 检查根文件
		return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
	} else if !os.IsNotExist(err) {
		return nil, newGenericError(err, SystemError)
	if err := os.MkdirAll(containerRoot, 0o711); err != nil {  // 创建并改变权限
		return nil, newGenericError(err, SystemError)
	if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {  // 改变目录组的信息
		return nil, newGenericError(err, SystemError)
	c := &linuxContainer{
		id:            id,
		root:          containerRoot,
		config:        config,
		initPath:      l.InitPath,
		initArgs:      l.InitArgs,
		criuPath:      l.CriuPath,
		newuidmapPath: l.NewuidmapPath,
		newgidmapPath: l.NewgidmapPath,
		cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), 
	}  			// 实例化生成一个container
	if l.NewIntelRdtManager != nil {
		c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
	c.state = &stoppedState{c: c}
	return c, nil

通过createContainer函数,就进行了一系列文件权限与输入参数的前置工作。

现在我们查看一下runner的run方法是如何工作的。

func (r *runner) run(config *specs.Process) (int, error) {
	var err error
	defer func() {
		if err != nil {
			r.destroy()
	}()
	if err = r.checkTerminal(config); err != nil {   // 检查终端
		return -1, err
	process, err := newProcess(*config, r.init, r.logLevel)   // 生成一个Process的实例
	if err != nil {
		return -1, err
	if len(r.listenFDs) > 0 {
		process.Env = append(process.Env, "LISTEN_FDS="+strconv.Itoa(len(r.listenFDs)), "LISTEN_PID=1")
		process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)  // 通过环境变量传递监听信息
	baseFd := 3 + len(process.ExtraFiles)
	for i := baseFd; i < baseFd+r.preserveFDs; i++ {
		_, err = os.Stat("/proc/self/fd/" + strconv.Itoa(i))
		if err != nil {
			return -1, errors.Wrapf(err, "please check that preserved-fd %d (of %d) is present", i-baseFd, r.preserveFDs)
		process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
	rootuid, err := r.container.Config().HostRootUID()  // 获取uid
	if err != nil {
		return -1, err
	rootgid, err := r.container.Config().HostRootGID()   // 获取gid
	if err != nil {
		return -1, err
	detach := r.detach || (r.action == CT_ACT_CREATE)
	// Setting up IO is a two stage process. We need to modify process to deal
	// with detaching containers, and then we get a tty after the container has
	// started.
	handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
	tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)  // 建立IO相关的配置 通过原始套接字进行通信
	if err != nil {
		return -1, err
	defer tty.Close()
	switch r.action {
	case CT_ACT_CREATE:
		err = r.container.Start(process)
	case CT_ACT_RESTORE:
		err = r.container.Restore(process, r.criuOpts)
	case CT_ACT_RUN:
		err = r.container.Run(process)  // 此时传入的是Run信息 故执行该路径
	default:
		panic("Unknown action")
	if err != nil {
		return -1, err
	if err = tty.waitConsole(); err != nil {  // 等待输入信息
		r.terminate(process)
		return -1, err
	if err = tty.ClosePostStart(); err != nil {  // 关闭信息
		r.terminate(process)
		return -1, err
	if r.pidFile != "" {
		if err = createPidFile(r.pidFile, process); err != nil {  // 创建Pid文件
			r.terminate(process)
			return -1, err
	status, err := handler.forward(process, tty, detach)  
	if err != nil {
		r.terminate(process)
	if detach {
		return 0, nil
	if err == nil {
		r.destroy()
	return status, err

所有重要的逻辑信息就都放在了container的Run方法中执行。

func (c *linuxContainer) Run(process *Process) error {
	if err := c.Start(process); err != nil {  // 开始执行
		return err
	if process.Init {      // 在run 的模式下该值为true
		return c.exec()  		// 执行加载
	return nil
func (c *linuxContainer) Start(process *Process) error {
	c.m.Lock()
	defer c.m.Unlock()
	if c.config.Cgroups.Resources.SkipDevices {
		return newGenericError(errors.New("can't start container with SkipDevices set"), ConfigInvalid)
	if process.Init {
		if err := c.createExecFifo(); err != nil {  // 创建执行文件的io信息
			return err
	if err := c.start(process); err != nil {   // 开始执行  
		if process.Init {
			c.deleteExecFifo()
		return err
	return nil

此时继续查看linuxContainer的start方法。

func (c *linuxContainer) start(process *Process) (retErr error) {
	parent, err := c.newParentProcess(process)   // 生成一个parentProcess
	if err != nil {
		return newSystemErrorWithCause(err, "creating new parent process")
	logsDone := parent.forwardChildLogs()  
	if logsDone != nil {
		defer func() {
			// Wait for log forwarder to finish. This depends on
			// runc init closing the _LIBCONTAINER_LOGPIPE log fd.
			err := <-logsDone
			if err != nil && retErr == nil {
				retErr = newSystemErrorWithCause(err, "forwarding init logs")
		}()
	if err := parent.start(); err != nil {   // 开始执行
		return newSystemErrorWithCause(err, "starting container process")
	if process.Init {
		c.fifo.Close()   // 执行完成之后执行钩子回调
		if c.config.Hooks != nil {
			s, err := c.currentOCIState()
			if err != nil {
				return err
			if err := c.config.Hooks[configs.Poststart].RunHooks(s); err != nil {
				if err := ignoreTerminateErrors(parent.terminate()); err != nil {
					logrus.Warn(errorsf.Wrapf(err, "Running Poststart hook"))
				return err
	return nil

首先通过newParentProcess函数生成一个父process,再由它来进行启动。

func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
	parentInitPipe, childInitPipe, err := utils.NewSockPair("init")  // 通过init的sock创建通信
	if err != nil {
		return nil, newSystemErrorWithCause(err, "creating new init pipe")
	messageSockPair := filePair{parentInitPipe, childInitPipe}
	parentLogPipe, childLogPipe, err := os.Pipe()  // 创建双工管道
	if err != nil {
		return nil, fmt.Errorf("Unable to create the log pipe:  %s", err)
	logFilePair := filePair{parentLogPipe, childLogPipe}
	cmd := c.commandTemplate(p, childInitPipe, childLogPipe)  // 通过模板来
	if !p.Init {
		return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
	// We only set up fifoFd if we're not doing a `runc exec`. The historic
	// reason for this is that previously we would pass a dirfd that allowed
	// for container rootfs escape (and not doing it in `runc exec` avoided
	// that problem), but we no longer do that. However, there's no need to do
	// this for `runc exec` so we just keep it this way to be safe.
	if err := c.includeExecFifo(cmd); err != nil {
		return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
	return c.newInitProcess(p, cmd, messageSockPair, logFilePair)  // 创建一个initProcess的process
func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
	cmd := exec.Command(c.initPath, c.initArgs[1:]...)   // 生成执行的命令 该命令就是runc init执行
	cmd.Args[0] = c.initArgs[0]
	cmd.Stdin = p.Stdin
	cmd.Stdout = p.Stdout
	cmd.Stderr = p.Stderr
	cmd.Dir = c.config.Rootfs
	if cmd.SysProcAttr == nil {
		cmd.SysProcAttr = &unix.SysProcAttr{}
	cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS"))  // 设置相关的环境变量
	cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
	if p.ConsoleSocket != nil {
		cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
		cmd.Env = append(cmd.Env,
			"_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
	cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe)
	cmd.Env = append(cmd.Env,
		"_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
		"_LIBCONTAINER_STATEDIR="+c.root,
	cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
	cmd.Env = append(cmd.Env,
		"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
		"_LIBCONTAINER_LOGLEVEL="+p.LogLevel,
	// NOTE: when running a container with no PID namespace and the parent process spawning the container is
	// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
	// even with the parent still running.
	if c.config.ParentDeathSignal > 0 {
		cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
	return cmd
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
	parentInitPipe, childInitPipe, err := utils.NewSockPair("init")  // 通过init的sock创建通信
	if err != nil {
		return nil, newSystemErrorWithCause(err, "creating new init pipe")
	messageSockPair := filePair{parentInitPipe, childInitPipe}
	parentLogPipe, childLogPipe, err := os.Pipe()  // 创建双工管道
	if err != nil {
		return nil, fmt.Errorf("Unable to create the log pipe:  %s", err)
	logFilePair := filePair{parentLogPipe, childLogPipe}
	cmd := c.commandTemplate(p, childInitPipe, childLogPipe)  // 通过模板来
	if !p.Init {
		return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
	// We only set up fifoFd if we're not doing a `runc exec`. The historic
	// reason for this is that previously we would pass a dirfd that allowed
	// for container rootfs escape (and not doing it in `runc exec` avoided
	// that problem), but we no longer do that. However, there's no need to do
	// this for `runc exec` so we just keep it this way to be safe.
	if err := c.includeExecFifo(cmd); err != nil {
		return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
	return c.newInitProcess(p, cmd, messageSockPair, logFilePair)  // 创建一个initProcess的process
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))   // 获取环境变量
	nsMaps := make(map[configs.NamespaceType]string)
	for _, ns := range c.config.Namespaces {
		if ns.Path != "" {
			nsMaps[ns.Type] = ns.Path
	_, sharePidns := nsMaps[configs.NEWPID]  
	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)  // 获取通信的数据
	if err != nil {
		return nil, err
	init := &initProcess{
		cmd:             cmd,
		messageSockPair: messageSockPair,
		logFilePair:     logFilePair,
		manager:         c.cgroupManager,
		intelRdtManager: c.intelRdtManager,
		config:          c.newInitConfig(p),
		container:       c,
		process:         p,
		bootstrapData:   data,
		sharePidns:      sharePidns,
	c.initProcess = init  // 保存initProcess
	return init, nil

初始化完成之后,就开始进入initProcess的start方法。

func (p *initProcess) start() (retErr error) {
	defer p.messageSockPair.parent.Close() //nolint: errcheck
	err := p.cmd.Start()      // 开始启动runc init的进行
	p.process.ops = p
	// close the write-side of the pipes (controlled by child)
	_ = p.messageSockPair.child.Close()   // 执行完成之后关闭信息的管道
	_ = p.logFilePair.child.Close()
	if err != nil {
		p.process.ops = nil
		return newSystemErrorWithCause(err, "starting init process command")
	waitInit := initWaiter(p.messageSockPair.parent)  //  等待数据往管道写
	defer func() {
		if retErr != nil {
			// Find out if init is killed by the kernel's OOM killer.
			// Get the count before killing init as otherwise cgroup
			// might be removed by systemd.
			oom, err := p.manager.OOMKillCount()
			if err != nil {
				logrus.WithError(err).Warn("unable to get oom kill count")
			} else if oom > 0 {
				// Does not matter what the particular error was,
				// its cause is most probably OOM, so report that.
				const oomError = "container init was OOM-killed (memory limit too low?)"
				if logrus.GetLevel() >= logrus.DebugLevel {
					// Only show the original error if debug is set,
					// as it is not generally very useful.
					retErr = newSystemErrorWithCause(retErr, oomError)
				} else {
					retErr = newSystemError(errors.New(oomError))
			werr := <-waitInit
			if werr != nil {
				logrus.WithError(werr).Warn()
			// Terminate the process to ensure we can remove cgroups.
			if err := ignoreTerminateErrors(p.terminate()); err != nil {
				logrus.WithError(err).Warn("unable to terminate initProcess")
			_ = p.manager.Destroy()
			if p.intelRdtManager != nil {
				_ = p.intelRdtManager.Destroy()
	}()
	// Do this before syncing with child so that no children can escape the
	// cgroup. We don't need to worry about not doing this and not being root
	// because we'd be using the rootless cgroup manager in that case.
	if err := p.manager.Apply(p.pid()); err != nil {  // 
		return newSystemErrorWithCause(err, "applying cgroup configuration for process")
	if p.intelRdtManager != nil {
		if err := p.intelRdtManager.Apply(p.pid()); err != nil {
			return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
	if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
		return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
	err = <-waitInit
	if err != nil {
		return err
	childPid, err := p.getChildPid()   // 获取子进程的pid
	if err != nil {
		return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
	// Save the standard descriptor names before the container process
	// can potentially move them (e.g., via dup2()).  If we don't do this now,
	// we won't know at checkpoint time which file descriptor to look up.
	fds, err := getPipeFds(childPid)  // 获取管道的fd
	if err != nil {
		return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
	p.setExternalDescriptors(fds)
	// Now it's time to setup cgroup namesapce
	if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
		if _, err := p.messageSockPair.parent.Write([]byte{createCgroupns}); err != nil {
			return newSystemErrorWithCause(err, "sending synchronization value to init process")
	// Wait for our first child to exit
	if err := p.waitForChildExit(childPid); err != nil {  // 等待子进程结束
		return newSystemErrorWithCause(err, "waiting for our first child to exit")
	if err := p.createNetworkInterfaces(); err != nil {  // 创建网络接口
		return newSystemErrorWithCause(err, "creating network interfaces")
	if err := p.updateSpecState(); err != nil {   // 更新spec状态
		return newSystemErrorWithCause(err, "updating the spec state")
	if err := p.sendConfig(); err != nil {   // 发送配置文件信息
		return newSystemErrorWithCause(err, "sending config to init process")
	var (
		sentRun    bool
		sentResume bool
	ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {  //  获取同步的数据信息
		switch sync.Type {
		case procReady:
			// set rlimits, this has to be done here because we lose permissions
			// to raise the limits once we enter a user-namespace
			if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {  // 设置资源限制
				return newSystemErrorWithCause(err, "setting rlimits for ready process")
			// call prestart and CreateRuntime hooks
			if !p.config.Config.Namespaces.Contains(configs.NEWNS) {  
				// Setup cgroup before the hook, so that the prestart and CreateRuntime hook could apply cgroup permissions.
				if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
					return newSystemErrorWithCause(err, "setting cgroup config for ready process")
				if p.intelRdtManager != nil {
					if err := p.intelRdtManager.Set(p.config.Config); err != nil {
						return newSystemErrorWithCause(err, "setting Intel RDT config for ready process")
				if p.config.Config.Hooks != nil {
					s, err := p.container.currentOCIState()
					if err != nil {
						return err
					// initProcessStartTime hasn't been set yet.
					s.Pid = p.cmd.Process.Pid
					s.Status = specs.StateCreating
					hooks := p.config.Config.Hooks
					if err := hooks[configs.Prestart].RunHooks(s); err != nil {
						return err
					if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
						return err
			// generate a timestamp indicating when the container was started
			p.container.created = time.Now().UTC()
			p.container.state = &createdState{
				c: p.container,    // 创建成功更新数据
			// NOTE: If the procRun state has been synced and the
			// runc-create process has been killed for some reason,
			// the runc-init[2:stage] process will be leaky. And
			// the runc command also fails to parse root directory
			// because the container doesn't have state.json.
			// In order to cleanup the runc-init[2:stage] by
			// runc-delete/stop, we should store the status before
			// procRun sync.
			state, uerr := p.container.updateState(p)  // 更新状态
			if uerr != nil {
				return newSystemErrorWithCause(err, "store init state")
			p.container.initProcessStartTime = state.InitProcessStartTime
			// Sync with child.
			if err := writeSync(p.messageSockPair.parent, procRun); err != nil {  // 发送数据到子进程 让子进程继续
				return newSystemErrorWithCause(err, "writing syncT 'run'")
			sentRun = true
		case procHooks:
			// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
			if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {   
				return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
			if p.intelRdtManager != nil {
				if err := p.intelRdtManager.Set(p.config.Config); err != nil {
					return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")
			if p.config.Config.Hooks != nil {   // 执行子进程的钩子函数
				s, err := p.container.currentOCIState()
				if err != nil {
					return err
				// initProcessStartTime hasn't been set yet.
				s.Pid = p.cmd.Process.Pid
				s.Status = specs.StateCreating
				hooks := p.config.Config.Hooks
				if err := hooks[configs.Prestart].RunHooks(s); err != nil {
					return err
				if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
					return err
			// Sync with child.
			if err := writeSync(p.messageSockPair.parent, procResume); err != nil {   // 将数据写入个子进程
				return newSystemErrorWithCause(err, "writing syncT 'resume'")
			sentResume = true
		default:
			return newSystemError(errors.New("invalid JSON payload from child"))
		return nil
	if !sentRun {
		return newSystemErrorWithCause(ierr, "container init")
	if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
		return newSystemError(errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process"))
	if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
		return newSystemErrorWithCause(err, "shutting down init pipe")
	// Must be done after Shutdown so the child will exit and we can wait for it.
	if ierr != nil {
		_, _ = p.wait()   
		return ierr
	return nil

整个函数就是通过与子进程的每次通信来控制进程的创建。

此时首先子进程执行的是runc init,执行的时候在包的首行里面,导入了nsenter包

_ "github.com/opencontainers/runc/libcontainer/nsenter"

# 该行代码会在导入nsenter包时执行nsenter.go中的init,进而调用nsexec()函数

// +build linux,!gccgo

package nsenter

/*
#cgo CFLAGS: -Wall
extern void nsexec();
void __attribute__((constructor)) init(void) {
	nsexec();
}
*/
import "C"

此时,首先执行的就是nsexec.c文件中的nsexec()函数,该函数的逻辑就是将创建流程拆分到两个子进程中分别进行,从而完成隔离环境的初始化。

void nsexec(void)
	int pipenum;
	jmp_buf env;
	int sync_child_pipe[2], sync_grandchild_pipe[2];
	struct nlconfig_t config = { 0 };
	 * Setup a pipe to send logs to the parent. This should happen
	 * first, because bail will use that pipe.
	setup_logpipe();  // 获取日志的管道
	 * If we don't have an init pipe, just return to the go routine.
	 * We'll only get an init pipe for start or exec.
	pipenum = initpipe();    // 初始化一个管道
	if (pipenum == -1)
		return;
	 * We need to re-exec if we are not in a cloned binary. This is necessary
	 * to ensure that containers won't be able to access the host binary
	 * through /proc/self/exe. See CVE-2019-5736.
	if (ensure_cloned_binary() < 0)
		bail("could not ensure we are a cloned binary");
	 * Inform the parent we're past initial setup.
	 * For the other side of this, see initWaiter.
	if (write(pipenum, "", 1) != 1)
		bail("could not inform the parent we are past initial setup");
	write_log(DEBUG, "=> nsexec container setup");
	/* Parse all of the netlink configuration. */
	nl_parse(pipenum, &config);   // 解析传入的配置文件
	/* Set oom_score_adj. This has to be done before !dumpable because
	 * /proc/self/oom_score_adj is not writeable unless you're an privileged
	 * user (if !dumpable is set). All children inherit their parent's
	 * oom_score_adj value on fork(2) so this will always be propagated
	 * properly.
	update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);  // 设置OOM配置
	 * Make the process non-dumpable, to avoid various race conditions that
	 * could cause processes in namespaces we're joining to access host
	 * resources (or potentially execute code).
	 * However, if the number of namespaces we are joining is 0, we are not
	 * going to be switching to a different security context. Thus setting
	 * ourselves to be non-dumpable only breaks things (like rootless
	 * containers), which is the recommendation from the kernel folks.
	if (config.namespaces) {   // 设置命令空间
		write_log(DEBUG, "set process as non-dumpable");
		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
			bail("failed to set process as non-dumpable");
	/* Pipe so we can tell the child when we've finished setting up. */
	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)  // 发送初始化完成信号
		bail("failed to setup sync pipe between parent and child");
	 * We need a new socketpair to sync with grandchild so we don't have
	 * race condition with child.
	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
		bail("failed to setup sync pipe between parent and grandchild");
	/* TODO: Currently we aren't dealing with child deaths properly. */
	 * Okay, so this is quite annoying.
	 * In order for this unsharing code to be more extensible we need to split
	 * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
	 * would be if we did clone(CLONE_NEWUSER) and the other namespaces
	 * separately, but because of SELinux issues we cannot really do that. But
	 * we cannot just dump the namespace flags into clone(...) because several
	 * usecases (such as rootless containers) require more granularity around
	 * the namespace setup. In addition, some older kernels had issues where
	 * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
	 * handle this while also dealing with SELinux so we choose SELinux support
	 * over broken kernel support).
	 * However, if we unshare(2) the user namespace *before* we clone(2), then
	 * all hell breaks loose.
	 * The parent no longer has permissions to do many things (unshare(2) drops
	 * all capabilities in your old namespace), and the container cannot be set
	 * up to have more than one {uid,gid} mapping. This is obviously less than
	 * ideal. In order to fix this, we have to first clone(2) and then unshare.
	 * Unfortunately, it's not as simple as that. We have to fork to enter the
	 * PID namespace (the PID namespace only applies to children). Since we'll
	 * have to double-fork, this clone_parent() call won't be able to get the
	 * PID of the _actual_ init process (without doing more synchronisation than
	 * I can deal with at the moment). So we'll just get the parent to send it
	 * for us, the only job of this process is to update
	 * /proc/pid/{setgroups,uid_map,gid_map}.
	 * And as a result of the above, we also need to setns(2) in the first child
	 * because if we join a PID namespace in the topmost parent then our child
	 * will be in that namespace (and it will not be able to give us a PID value
	 * that makes sense without resorting to sending things with cmsg).
	 * This also deals with an older issue caused by dumping cloneflags into
	 * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
	 * we have to unshare(2) before clone(2) in order to do this. This was fixed
	 * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
	 * aware, the last mainline kernel which had this bug was Linux 3.12.
	 * However, we cannot comment on which kernels the broken patch was
	 * backported to.
	 * -- Aleksa "what has my life come to?" Sarai
	current_stage = setjmp(env);   // 通过env来进行子进程数据之间的跳转
	switch (current_stage) {
		 * Stage 0: We're in the parent. Our job is just to create a new child
		 *          (stage 1: STAGE_CHILD) process and write its uid_map and
		 *          gid_map. That process will go on to create a new process, then
		 *          it will send us its PID which we will send to the bootstrap
		 *          process.
	case STAGE_PARENT:{
			int len;
			pid_t stage1_pid = -1, stage2_pid = -1;
			bool stage1_complete, stage2_complete;
			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
			write_log(DEBUG, "~> nsexec stage-0");
			/* Start the process of getting a container. */
			write_log(DEBUG, "spawn stage-1");
			stage1_pid = clone_parent(&env, STAGE_CHILD);   // 生成一个子进程
			if (stage1_pid < 0)
				bail("unable to spawn stage-1");
			syncfd = sync_child_pipe[1];
			close(sync_child_pipe[0]);
			 * State machine for synchronisation with the children. We only
			 * return once both the child and grandchild are ready.
			write_log(DEBUG, "-> stage-1 synchronisation loop");
			stage1_complete = false;
			while (!stage1_complete) {
				enum sync_t s;
				if (read(syncfd, &s, sizeof(s)) != sizeof(s))  // 通过fd获取信息
					bail("failed to sync with stage-1: next state");
				switch (s) {
				case SYNC_USERMAP_PLS:
					write_log(DEBUG, "stage-1 requested userns mappings");
					 * Enable setgroups(2) if we've been asked to. But we also
					 * have to explicitly disable setgroups(2) if we're
					 * creating a rootless container for single-entry mapping.
					 * i.e. config.is_setgroup == false.
					 * (this is required since Linux 3.19).
					 * For rootless multi-entry mapping, config.is_setgroup shall be true and
					 * newuidmap/newgidmap shall be used.
					if (config.is_rootless_euid && !config.is_setgroup)
						update_setgroups(stage1_pid, SETGROUPS_DENY);
					/* Set up mappings. */
					update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);  // 更新用户组信息
					update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);
					s = SYNC_USERMAP_ACK;
					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {  // 设置成功之后发送ack给主进程
						sane_kill(stage1_pid, SIGKILL);
						sane_kill(stage2_pid, SIGKILL);
						bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
					break;
				case SYNC_RECVPID_PLS:
					write_log(DEBUG, "stage-1 requested pid to be forwarded");
					/* Get the stage-2 pid. */
					if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {  // 读取pid信息
						sane_kill(stage1_pid, SIGKILL);
						sane_kill(stage2_pid, SIGKILL);
						bail("failed to sync with stage-1: read(stage2_pid)");
					/* Send ACK. */
					s = SYNC_RECVPID_ACK;   // 发送ack信息
					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {   // 写入数据
						sane_kill(stage1_pid, SIGKILL);
						sane_kill(stage2_pid, SIGKILL);
						bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
					 * Send both the stage-1 and stage-2 pids back to runc.
					 * runc needs the stage-2 to continue process management,
					 * but because stage-1 was spawned with CLONE_PARENT we
					 * cannot reap it within stage-0 and thus we need to ask
					 * runc to reap the zombie for us.
					write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
						  stage1_pid, stage2_pid);
					len =
					    dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid,
						    stage2_pid);
					if (len < 0) {
						sane_kill(stage1_pid, SIGKILL);
						sane_kill(stage2_pid, SIGKILL);
						bail("failed to sync with runc: write(pid-JSON)");
					break;
				case SYNC_CHILD_FINISH:
					write_log(DEBUG, "stage-1 complete");  // 此时stage-1完成
					stage1_complete = true;
					break;
				default:
					bail("unexpected sync value: %u", s);
			write_log(DEBUG, "<- stage-1 synchronisation loop");
			/* Now sync with grandchild. */
			syncfd = sync_grandchild_pipe[1];
			close(sync_grandchild_pipe[0]);
			write_log(DEBUG, "-> stage-2 synchronisation loop");
			stage2_complete = false;
			while (!stage2_complete) {   // 进入第二阶段
				enum sync_t s;
				write_log(DEBUG, "signalling stage-2 to run");
				s = SYNC_GRANDCHILD;
				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
					sane_kill(stage2_pid, SIGKILL);
					bail("failed to sync with child: write(SYNC_GRANDCHILD)");
				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with child: next state");
				switch (s) {
				case SYNC_CHILD_FINISH:
					write_log(DEBUG, "stage-2 complete");
					stage2_complete = true;
					break;
				default:
					bail("unexpected sync value: %u", s);
			write_log(DEBUG, "<- stage-2 synchronisation loop");
			write_log(DEBUG, "<~ nsexec stage-0");
			exit(0);
		break;
		 * Stage 1: We're in the first child process. Our job is to join any
		 *          provided namespaces in the netlink payload and unshare all of
		 *          the requested namespaces. If we've been asked to CLONE_NEWUSER,
		 *          we will ask our parent (stage 0) to set up our user mappings
		 *          for us. Then, we create a new child (stage 2: STAGE_INIT) for
		 *          PID namespace. We then send the child's PID to our parent
		 *          (stage 0).
	case STAGE_CHILD:{
			pid_t stage2_pid = -1;
			enum sync_t s;
			/* We're in a child and thus need to tell the parent if we die. */
			syncfd = sync_child_pipe[0];
			close(sync_child_pipe[1]);
			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
			write_log(DEBUG, "~> nsexec stage-1");
			 * We need to setns first. We cannot do this earlier (in stage 0)
			 * because of the fact that we forked to get here (the PID of
			 * [stage 2: STAGE_INIT]) would be meaningless). We could send it
			 * using cmsg(3) but that's just annoying.
			if (config.namespaces)
				join_namespaces(config.namespaces);
			 * Deal with user namespaces first. They are quite special, as they
			 * affect our ability to unshare other namespaces and are used as
			 * context for privilege checks.
			 * We don't unshare all namespaces in one go. The reason for this
			 * is that, while the kernel documentation may claim otherwise,
			 * there are certain cases where unsharing all namespaces at once
			 * will result in namespace objects being owned incorrectly.
			 * Ideally we should just fix these kernel bugs, but it's better to
			 * be safe than sorry, and fix them separately.
			 * A specific case of this is that the SELinux label of the
			 * internal kern-mount that mqueue uses will be incorrect if the
			 * UTS namespace is cloned before the USER namespace is mapped.
			 * I've also heard of similar problems with the network namespace
			 * in some scenarios. This also mirrors how LXC deals with this
			 * problem.
			if (config.cloneflags & CLONE_NEWUSER) {
				write_log(DEBUG, "unshare user namespace");
				if (unshare(CLONE_NEWUSER) < 0)
					bail("failed to unshare user namespace");
				config.cloneflags &= ~CLONE_NEWUSER;
				 * We need to set ourselves as dumpable temporarily so that the
				 * parent process can write to our procfs files.
				if (config.namespaces) {
					write_log(DEBUG, "temporarily set process as dumpable");
					if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
						bail("failed to temporarily set process as dumpable");
				 * We don't have the privileges to do any mapping here (see the
				 * clone_parent rant). So signal stage-0 to do the mapping for
				 * us.
				write_log(DEBUG, "request stage-0 to map user namespace");
				s = SYNC_USERMAP_PLS;
				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
				/* ... wait for mapping ... */
				write_log(DEBUG, "request stage-0 to map user namespace");
				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
				if (s != SYNC_USERMAP_ACK)
					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
				/* Revert temporary re-dumpable setting. */
				if (config.namespaces) {
					write_log(DEBUG, "re-set process as non-dumpable");
					if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
						bail("failed to re-set process as non-dumpable");
				/* Become root in the namespace proper. */
				if (setresuid(0, 0, 0) < 0)
					bail("failed to become root in user namespace");
			 * Unshare all of the namespaces. Now, it should be noted that this
			 * ordering might break in the future (especially with rootless
			 * containers). But for now, it's not possible to split this into
			 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
			 * Note that we don't merge this with clone() because there were
			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
			 * was broken, so we'll just do it the long way anyway.
			write_log(DEBUG, "unshare remaining namespace (except cgroupns)");
			if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)   // 设置隔离空间
				bail("failed to unshare remaining namespaces (except cgroupns)");
			 * TODO: What about non-namespace clone flags that we're dropping here?
			 * We fork again because of PID namespace, setns(2) or unshare(2) don't
			 * change the PID namespace of the calling process, because doing so
			 * would change the caller's idea of its own PID (as reported by getpid()),
			 * which would break many applications and libraries, so we must fork
			 * to actually enter the new PID namespace.
			write_log(DEBUG, "spawn stage-2");
			stage2_pid = clone_parent(&env, STAGE_INIT);  // 创建一个子进程来进行下一步的操作
			if (stage2_pid < 0)
				bail("unable to spawn stage-2");
			/* Send the child to our parent, which knows what it's doing. */
			write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
			s = SYNC_RECVPID_PLS;
			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
				sane_kill(stage2_pid, SIGKILL);
				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
			if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
				sane_kill(stage2_pid, SIGKILL);
				bail("failed to sync with parent: write(stage2_pid)");
			/* ... wait for parent to get the pid ... */
			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
				sane_kill(stage2_pid, SIGKILL);
				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
			if (s != SYNC_RECVPID_ACK) {
				sane_kill(stage2_pid, SIGKILL);
				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
			write_log(DEBUG, "signal completion to stage-0");
			s = SYNC_CHILD_FINISH;
			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
				sane_kill(stage2_pid, SIGKILL);
				bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
			/* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
			write_log(DEBUG, "<~ nsexec stage-1");
			exit(0);
		break;
		 * Stage 2: We're the final child process, and the only process that will
		 *          actually return to the Go runtime. Our job is to just do the
		 *          final cleanup steps and then return to the Go runtime to allow
		 *          init_linux.go to run.
	case STAGE_INIT:{
			 * We're inside the child now, having jumped from the
			 * start_child() code after forking in the parent.
			enum sync_t s;
			/* We're in a child and thus need to tell the parent if we die. */
			syncfd = sync_grandchild_pipe[0];
			close(sync_grandchild_pipe[1]);
			close(sync_child_pipe[0]);
			close(sync_child_pipe[1]);
			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
			write_log(DEBUG, "~> nsexec stage-2");
			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
				bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
			if (s != SYNC_GRANDCHILD)
				bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);
			if (setsid() < 0)
				bail("setsid failed");
			if (setuid(0) < 0)
				bail("setuid failed");
			if (setgid(0) < 0)
				bail("setgid failed"); // 检查刚刚创建的数据是否正确
			if (!config.is_rootless_euid && config.is_setgroup) {
				if (setgroups(0, NULL) < 0)  // 设置groups
					bail("setgroups failed");
			 * Wait until our topmost parent has finished cgroup setup in
			 * p.manager.Apply().
			 * TODO(cyphar): Check if this code is actually needed because we
			 *               should be in the cgroup even from stage-0, so
			 *               waiting until now might not make sense.
			if (config.cloneflags & CLONE_NEWCGROUP) {
				uint8_t value;
				if (read(pipenum, &value, sizeof(value)) != sizeof(value))
					bail("read synchronisation value failed");
				if (value == CREATECGROUPNS) {
					write_log(DEBUG, "unshare cgroup namespace");
					if (unshare(CLONE_NEWCGROUP) < 0)
						bail("failed to unshare cgroup namespace");
				} else
					bail("received unknown synchronisation value");
			write_log(DEBUG, "signal completion to stage-0");
			s = SYNC_CHILD_FINISH;
			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
				bail("failed to sync with patent: write(SYNC_CHILD_FINISH)");
			/* Close sync pipes. */
			close(sync_grandchild_pipe[0]);
			/* Free netlink data. */
			nl_free(&config);
			/* Finish executing, let the Go runtime take over. */
			write_log(DEBUG, "<= nsexec container setup");
			write_log(DEBUG, "booting up go runtime ...");
			return;
		break;
	default:
		bail("unknown stage '%d' for jump value", current_stage);
	/* Should never be reached. */
	bail("should never be reached");

从函数的注释中也可看出为什么设计成通过两次子进程的初始化来解决不同内核版本之间的兼容问题。

在隔离环境初始化完成之后,就是继续执行init命令的Action中的StartInitialization函数。

该命令主要执行的就是StartInitialization函数。

// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) {
	// Get the INITPIPE.
	envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
	pipefd, err := strconv.Atoi(envInitPipe)
	if err != nil {
		return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
	pipe := os.NewFile(uintptr(pipefd), "pipe")  
	defer pipe.Close()
	// Only init processes have FIFOFD.
	fifofd := -1
	envInitType := os.Getenv("_LIBCONTAINER_INITTYPE")
	it := initType(envInitType)
	if it == initStandard {
		envFifoFd := os.Getenv("_LIBCONTAINER_FIFOFD")
		if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
			return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err)
	var consoleSocket *os.File
	if envConsole := os.Getenv("_LIBCONTAINER_CONSOLE"); envConsole != "" {
		console, err := strconv.Atoi(envConsole)
		if err != nil {
			return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err)
		consoleSocket = os.NewFile(uintptr(console), "console-socket")
		defer consoleSocket.Close()
	logPipeFdStr := os.Getenv("_LIBCONTAINER_LOGPIPE")
	logPipeFd, err := strconv.Atoi(logPipeFdStr)
	if err != nil {
		return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE=%s to int: %s", logPipeFdStr, err)
	// clear the current process's environment to clean any libcontainer
	// specific env vars.
	os.Clearenv()
	defer func() {
		// We have an error during the initialization of the container's init,
		// send it back to the parent process in the form of an initError.
		if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
			fmt.Fprintln(os.Stderr, err)
			return
		if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
			fmt.Fprintln(os.Stderr, err)
			return
	}()
	defer func() {
		if e := recover(); e != nil {
			err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
	}()
	i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd)  // 通过从环境变量获取的各个参数来初始化一个container
	if err != nil {
		return err
	// If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
	return i.Init()    // 执行

该函数主要就是获取环境变量里面配置的各个参数,然后调用newContainerInit函数来进行初始化。

func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) (initer, error) {
	var config *initConfig
	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
		return nil, err
	if err := populateProcessEnvironment(config.Env); err != nil {
		return nil, err
	switch t {
	case initSetns:
		return &linuxSetnsInit{
			pipe:          pipe,
			consoleSocket: consoleSocket,
			config:        config,
			logFd:         logFd,
		}, nil
	case initStandard:
		return &linuxStandardInit{   // 默认的是该init函数 返回该函数
			pipe:          pipe,
			consoleSocket: consoleSocket,
			parentPid:     unix.Getppid(),
			config:        config,
			fifoFd:        fifoFd,
			logFd:         logFd,
		}, nil
	return nil, fmt.Errorf("unknown init type %q", t)

在initStandard的使用过程中,先进行一些参数的设置操作,最后通过exec进入容器中指定的入口进程执行。

func (l *linuxStandardInit) Init() error {
	runtime.LockOSThread()      // 获取锁
	defer runtime.UnlockOSThread()
	if !l.config.Config.NoNewKeyring {
		if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
			return err
		defer selinux.SetKeyLabel("") //nolint: errcheck
		ringname, keepperms, newperms := l.getSessionRingParams()
		// Do not inherit the parent's session keyring.
		if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
			// If keyrings aren't supported then it is likely we are on an
			// older kernel (or inside an LXC container). While we could bail,
			// the security feature we are using here is best-effort (it only
			// really provides marginal protection since VFS credentials are
			// the only significant protection of keyrings).
			// TODO(cyphar): Log this so people know what's going on, once we
			//               have proper logging in 'runc init'.
			if errors.Cause(err) != unix.ENOSYS {
				return errors.Wrap(err, "join session keyring")
		} else {
			// Make session keyring searcheable. If we've gotten this far we
			// bail on any error -- we don't want to have a keyring with bad
			// permissions.
			if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
				return errors.Wrap(err, "mod keyring permissions")
	if err := setupNetwork(l.config); err != nil {   // 设置网络配置
		return err
	if err := setupRoute(l.config.Config); err != nil {   // 设置路由配置
		return err
	// initialises the labeling system
	selinux.GetEnabled()
	if err := prepareRootfs(l.pipe, l.config); err != nil {  // 准备根目录
		return err
	// Set up the console. This has to be done *before* we finalize the rootfs,
	// but *after* we've given the user the chance to set up all of the mounts
	// they wanted.
	if l.config.CreateConsole {
		if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
			return err
		if err := system.Setctty(); err != nil {
			return errors.Wrap(err, "setctty")
	// Finish the rootfs setup.
	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
		if err := finalizeRootfs(l.config.Config); err != nil {
			return err
	if hostname := l.config.Config.Hostname; hostname != "" {  // 设置hostname
		if err := unix.Sethostname([]byte(hostname)); err != nil {
			return errors.Wrap(err, "sethostname")
	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return errors.Wrap(err, "apply apparmor profile")
	for key, value := range l.config.Config.Sysctl {
		if err := writeSystemProperty(key, value); err != nil {
			return errors.Wrapf(err, "write sysctl key %s", key)
	for _, path := range l.config.Config.ReadonlyPaths {   // 设置只读路径
		if err := readonlyPath(path); err != nil {
			return errors.Wrapf(err, "readonly path %s", path)
	for _, path := range l.config.Config.MaskPaths {
		if err := maskPath(path, l.config.Config.MountLabel); err != nil {
			return errors.Wrapf(err, "mask path %s", path)
	pdeath, err := system.GetParentDeathSignal()
	if err != nil {
		return errors.Wrap(err, "get pdeath signal")
	if l.config.NoNewPrivileges {
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return errors.Wrap(err, "set nonewprivileges")
	// Tell our parent that we're ready to Execv. This must be done before the
	// Seccomp rules have been applied, because we need to be able to read and
	// write to a socket.
	if err := syncParentReady(l.pipe); err != nil {  // 同步信息
		return errors.Wrap(err, "sync ready")
	if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
		return errors.Wrap(err, "set process label")
	defer selinux.SetExecLabel("") //nolint: errcheck
	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
	// do this before dropping capabilities; otherwise do it as late as possible
	// just before execve so as few syscalls take place after it as possible.
	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return err
	if err := finalizeNamespace(l.config); err != nil {
		return err
	// finalizeNamespace can change user/group which clears the parent death
	// signal, so we restore it here.
	if err := pdeath.Restore(); err != nil {
		return errors.Wrap(err, "restore pdeath signal")
	// Compare the parent from the initial start of the init process and make
	// sure that it did not change.  if the parent changes that means it died
	// and we were reparented to something else so we should just kill ourself
	// and not cause problems for someone else.
	if unix.Getppid() != l.parentPid {
		return unix.Kill(unix.Getpid(), unix.SIGKILL)
	// Check for the arg before waiting to make sure it exists and it is
	// returned as a create time error.
	name, err := exec.LookPath(l.config.Args[0])  // 找到路径
	if err != nil {
		return err
	// Close the pipe to signal that we have completed our init.
	logrus.Debugf("init: closing the pipe to signal completion")
	_ = l.pipe.Close()
	// Close the log pipe fd so the parent's ForwardLogs can exit.
	if err := unix.Close(l.logFd); err != nil {
		return newSystemErrorWithCause(err, "closing log pipe fd")
	// Wait for the FIFO to be opened on the other side before exec-ing the
	// user process. We open it through /proc/self/fd/$fd, because the fd that
	// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
	// re-open an O_PATH fd through /proc.
	fd, err := unix.Open("/proc/self/fd/"+strconv.Itoa(l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)  // 打开文件
	if err != nil {
		return newSystemErrorWithCause(err, "open exec fifo")
	if _, err := unix.Write(fd, []byte("0")); err != nil {   // 写入0
		return newSystemErrorWithCause(err, "write 0 exec fifo")
	// Close the O_PATH fifofd fd before exec because the kernel resets
	// dumpable in the wrong order. This has been fixed in newer kernels, but
	// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
	// N.B. the core issue itself (passing dirfds to the host filesystem) has
	// since been resolved.
	// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
	_ = unix.Close(l.fifoFd)
	// Set seccomp as close to execve as possible, so as few syscalls take
	// place afterward (reducing the amount of syscalls that users need to
	// enable in their seccomp profiles).
	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return newSystemErrorWithCause(err, "init seccomp")
	s := l.config.SpecState
	s.Pid = unix.Getpid()
	s.Status = specs.StateCreated
	if err := l.config.Config.Hooks[configs.StartContainer].RunHooks(s); err != nil {
		return err
	if err := system.Exec(name, l.config.Args[0:], os.Environ()); err != nil {  // 替换执行
		return newSystemErrorWithCause(err, "exec user process")
	return nil

子进程进入上述流程之后,run进程此时仍在等待其完成,之后才可以退出。

// exec waits on the exec FIFO located at c.root/execFifoFilename and returns
// once a result for it has been handled. While waiting, it checks every 100ms
// whether the init process (c.initProcess) can no longer be stat-ed or has
// become a zombie.
//
// It returns the value of handleFifoResult when the blocking open delivers a
// result, nil when the init process is gone but the FIFO result is handled
// successfully, and a "container process is already dead" error otherwise.
//
// NOTE(review): the closing braces are absent from this listing as given; the
// nesting described here follows the indentation.
func (c *linuxContainer) exec() error {
	path := filepath.Join(c.root, execFifoFilename)
	pid := c.initProcess.pid()
	// Presumably starts a blocking open of the FIFO and reports its outcome on
	// the returned channel -- confirm against awaitFifoOpen.
	blockingFifoOpenCh := awaitFifoOpen(path)
	for {
		select {
		case result := <-blockingFifoOpenCh:
			return handleFifoResult(result)  // Handle the result obtained from the FIFO, then return so the wait ends.
		case <-time.After(time.Millisecond * 100):
			// Nothing arrived within 100ms: look at the init process state.
			stat, err := system.Stat(pid)
			if err != nil || stat.State == system.Zombie {
				// could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check.
				// see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete).
				if err := handleFifoResult(fifoOpen(path, false)); err != nil {
					// The underlying error is replaced by a fixed message.
					return errors.New("container process is already dead")
				return nil

至此,run命令的整个执行流程就全部完成了。

runc的实现原理是:利用管道在进程之间进行通信,依次完成网络初始化和隔离环境的生成,最终进入到容器指定的入口程序。

本文简单概述了runc中run命令的执行流程。该流程相对繁琐,并且通过不同的进程分别完成不同的任务来实现初始化。更为完善的原理流程网上也有很多资料可查,本文仅为后续的学习做个记录。由于本人才疏学浅,如有错误请批评指正。

runcrunc作为容器的运行时,现在作为独立的项目来进行发展,runc提供一套简单的容器运行环境,包括进程的命名空间、cgroups和文件系统权限等管理的功能,runc是基于oci标准的产物,可以让大家都通过统一的接口来进行运行时的操作。其本质的管理工作也是最主要的几个重要的函数clone,unshare和setns等重要的操作函数。runc原理流程runc作为运行时,即在提供了挂载目录、运行权限等运行参数的情况下将容器启动运行,真正的作为一个运行管理的工具使用,一些例如镜像、日志配置等待交互清理功 基于持续时间的HMM的心音分割代码 这是运行出版物中概述的心音分割算法的Matlab代码: D.Springer等人,“基于Logistic回归-基于HSMM的心音分割”,IEEE Trans。 生物医学。 英文,印刷中,2015。 该代码包括特征提取,训练与时长相关的HMM,以及使用扩展的Viterbi算法对最可能的状态序列进行解码。 正在运行的代码示例可以在“ run_Example_Springer_Script.m”中看到。 David Springer 该程序是免费软件:您可以根据自由软件基金会发布的GNU通用公共许可证的条款(许可证的版本3或更高版本)重新分发和/或修改它。 分发该程序是希望它会有用,但是没有任何保证; 甚至没有对适销性或特定用途适用性的暗示保证。 有关更多详细信息,请参见GNU通用公共许可证。 您应该已经与该程序一起收到了GNU通用公共许可证的副本。 如果不是,请参见。        官方说明:runC是一个根据OCI(Open Container Initiative)标准创建并运行容器的CLI tool        解释说明:简单理解,runc其实就是Docker最核心的部分,runc可以不通过Docker引擎,直接创建,运行,销毁容器。 我的环境:        系统:CentOS Linux release 7.5.1804 (Core)...
Open Container Initiative(OCI) 开放容器计划: OCI 标准包含 运行时标准 和 镜像标准 两个部分,而 OCI 这个组织则是由 Docker, CoreOS 和其他的一些公司共同发起创建的,致力于将容器运行时和格式标准化。 即:凡是遵守此标准的实现,无论是 Docker 还是 rkt 或者其他的运行时实现,均可以通过标准的镜像启动容器。 Runc: runc 则是在 OCI 成立后,Docker 将其容器运行时 libcontainer 贡献出来后,并加以改造而成的,是Doc
在每一个Kubernetes节点中,运行着kubelet,负责为Pod创建销毁容器,kubelet预定义了API接口,通过GRPC从指定的位置调用特定的API进行相关操作。而这些CRI的实现者,如cri-o, containerd等,通过调用runc创建出容器。runc功能相对单一,即针对特定的配置,构建出容器运行指定进程,它不能直接用来构建镜像,kubernetes依赖的如cri-o这类CRI,在runc基础上增加了通过API管理镜像,容器等功能。 Kubelet,Cri-O,runc,Linux大致层级
作者:彭南光(光南) 本文整理自阿里云高级研发工程师彭南光(光南) 在 KubeCon China 2021 大会的演讲实录,分享了阿里巴巴是如何通过自研通用链路探测+定向巡检工具 KubeProbe 应对大规模集群的稳定性挑战的。关于阿里云云原生团队在本次 KubeCon 上分享的全部内容沉淀于电子书《云原生与云未来的新可能》当中,可点击文末“阅读原文”下载。 快速发现和定位问题的能力是快速恢复系统的基石,只有先做到快速发现和定位问题,才能谈如何解决问题,尽量减少用户损失。那么如何在复杂的大规模场景中
一、容器的标准OCF 容器标准化的目标:操作标准化、内容无关、基础设施无关。 runc是OCF的其中一个具体实现,下面从runc的角度来介绍OCF,这并不是OCF的全部标准 1、容器标准包(bundle) 现在的runc下的bundle包含了两个部分,rootfs目录必须与config.json文件同时存在容器目录最顶层。 1、 /rootfs目录,根文件系统目录,包含了容器执行所需的必要环境依赖,如/bin、/var、/lib、/dev、/usr等目录及相应文件。 2、 配置文件config.json,包
在每一个Kubernetes节点中,运行着kubelet,负责为Pod创建销毁容器,kubelet预定义了API接口,通过GRPC从指定的位置调用特定的API进行相关操作。而这些CRI的实现者,如cri-o, containerd等,通过调用runc创建出容器。runc功能相对单一,即针对特定的配置,构建出容器运行指定进程,它不能直接用来构建镜像,kubernetes依赖的如cri-o这类CRI,在runc基础上增加了通过API管理镜像,容器等功能。 Kubelet,Cri-O,runc,Linux大致层
云原生时代,容器有多重要不言而喻。而runc是一个 CLI 工具,用于根据 OCI 规范在 Linux 上生成和运行容器,是容器的底层技术之一。本系列从源代码层面深入剖析runc项目,带你解开容器神秘面纱的背后世界。 在剖析runc之前,我们需要重点了解下oci标准。 Open Container Initiative (OCI) 是Linux基金
安装docker,其实是安装了docker客户端、dockerd等一系列的组件,其中比较重要的有下面几个。docker程序是一个客户端工具,用来把用户的请求发送给dockerdaemon(dockerd)。该程序的安装路径为:dockerdaemon(dockerd),一般也会被称为dockerengine。该程序的安装路径为:详情请参考《Containerd简介》。该程序的安装路径为:它是containerd的组件,是容器的运行时载体,我们在docker宿主机上看到的shim也正是代表着一个个通过调用containerd启动的docker容器。该程序的安装路径为:详情请参考《RunC简介》。
本文接 探索runC(上) 前文讲到,newParentProcess() 根据源自 config.json 的配置,最终生成变量 initProcess ,这个 initProcess 包含的信息主要有 cmd 记录了要执行的可执行文件名,即 "/proc/self/exe init",注意不要和容器要执行的 sleep 5...
编译环境如下。 [root@VM-16-15-centos runc]# uname -a Linux VM-16-15-centos 3.10.0-1160.11.1.el7.x86_64 #1 SMP Fri Dec 18 16:34:56 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux [root@VM-16-15-centos bin]# go/bin/go version go version go1.16.11 linux/amd64 [root@VM-16-15
好的,我将以 CSDN ChitGPT 的角色回答您的问题。 Spring Boot 是一个用于开发和构建单独的、基于 Spring 的应用程序的框架。在运行 Spring Boot 应用程序时,可以使用命令“springboot run”来启动应用程序。下面是 springboot run 命令的工作原理: 1. 首先,Spring Boot Maven 插件将构建和打包的应用程序文件加载到目标文件夹。默认情况下,Spring Boot 会在“target”目录中查找 JAR 包。 2. 接下来,Spring Boot Maven 插件创建一个包含应用程序类路径和配置的类加载器,并启动应用程序的主类。这个主类是我们在应用程序中指定的启动类。 3. Spring Boot 应用程序在启动时会扫描类路径中的注释,以查找 Spring 组件和其他应用程序上下文文件(例如XML文件)。使用此信息,应用程序可以自动配置 Spring Framework。 4. 在成功完成 Spring Framework 上下文初始化后,Spring Boot 将启动内嵌的 Tomcat 或 Jetty Web 服务器,并将应用程序部署到服务器。 5. 最后,Spring Boot 程序将在启动过程中显示一些有用的信息,例如日志级别和端口号等。 总之,Spring Boot 通过自动配置和内嵌的 Web 服务器使得构建独立的 Spring 应用程序变得更加简单和快捷。