内存分配函数

[TOC]

mallocgc() 内存主入口

// 分配特定字节大小的对象.
// 小对象 直接优先从线程缓存 mcache 的 free list 中分配
// 大对象 (> 32 kB) 会直接从页堆 mheap 中分配内存.
func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
	if gcphase == _GCmarktermination { // _GCmarktermination 阶段不能进行内存分配
		throw("mallocgc called with gcphase == _GCmarktermination")
	}

	if size == 0 {  // 所有大小为 0 的对象都使用同一块内存地址
		return unsafe.Pointer(&zerobase)
	}

	// debug.sbrk ...

    // 辅助 gorotuine，主要是在 GC 阶段，负责内存分配
    // 如果没有在执行 GC，那么这个 g 可以是 nil
	var assistG *g //
	if gcBlackenEnabled != 0 {
        // 当前的用户 G 负责这次分配
		assistG = getg()
		if assistG.m.curg != nil {
			assistG = assistG.m.curg
		}
		// Charge the allocation against the G. We'll account
		// for internal fragmentation at the end of mallocgc.
		assistG.gcAssistBytes -= int64(size)

		if assistG.gcAssistBytes < 0 {
			// This G is in debt. Assist the GC to correct
			// this before allocating. This must happen
			// before disabling preemption.
			gcAssistAlloc(assistG)
		}
	}

	// 设置 mp.mallocing 状态，以使其避免被 GC 抢占
	mp := acquirem()
	if mp.mallocing != 0 {
		throw("malloc deadlock")
	}
	if mp.gsignal == getg() {
		throw("malloc during signal")
	}
	mp.mallocing = 1

	shouldhelpgc := false
	dataSize := size
	var c *mcache
	if mp.p != 0 {
		c = mp.p.ptr().mcache
	} else {
		// We will be called without a P while bootstrapping,
		// in which case we use mcache0, which is set in mallocinit.
		// mcache0 is cleared when bootstrapping is complete,
		// by procresize.
		c = mcache0
		if c == nil {
			throw("malloc called with no P")
		}
	}
	var span *mspan
	var x unsafe.Pointer
	noscan := typ == nil || typ.ptrdata == 0
	if size <= maxSmallSize {
		if noscan && size < maxTinySize {  // 小于 16 字节的微且没有指针的对象内存分配
			// Tiny allocator.
			//
            // Tiny allocator 会将几个 tiny 的内存分配请求组合为单个内存块。
            // 只有当该内存块中所有子对象都变为待回收后，才会释放这部分内存块。
            // 子对象必须是 noscan 类型(不包含指针)，这样可以确保浪费的内存是可控的。
            //
            // 用来组合小对象的内存块的大小是可调(通过maxTinySize)。
            // 当前的设置是 16 bytes，最坏情况下会浪费 2x 内存(当所有子对象中只有一个对象是可达状态时)。
            // 如果修改为 8 bytes ，虽然完全没有浪费，但是却不太可能进行组合操作。
            // 32 bytes 能够提高合并的可能性，但是最坏情况下会造成 4x 浪费。
            // 不考虑 block 大小的话，最好的情况下是 8x。 
            //
            // 从 tiny allocator 中获得的内存不能被显式释放。
            // 因此当一个对象需要被显式释放时，我们要确保它的 size >= maxTinySize
            //
            // 当对象都是由 tiny allocator 分配时，SetFinalizer 有一种特殊情况，
            // 这种情况下会允许在一个内存块中设置内部字节的 finalizers
            //
            // 实现 tiny allocator 的主要目标是对那些小字符串和独立的逃逸变量。
            // 在一个 json benchmark 中，这个 allocator 会将其性能提升 ~12%
            // 并且使堆大小降低了 ~20%。
			off := c.tinyoffset
            // 对齐，调整偏移量 
			if size&7 == 0 {
				off = alignUp(off, 8)
			} else if size&3 == 0 {
				off = alignUp(off, 4)
			} else if size&1 == 0 {
				off = alignUp(off, 2)
			}
			if off+size <= maxTinySize && c.tiny != 0 {
                // 把将tiny对象适配到已存在的 tiny 块
				x = unsafe.Pointer(c.tiny + off)
				c.tinyoffset = off + size
				c.local_tinyallocs++
				mp.mallocing = 0
				releasem(mp)
				return x
			}
			// 分配一个新的 maxTinySize 大小的块， 放入 mcache.alloc 数组中
			span = c.alloc[tinySpanClass] // 即spanClass == 2 ，16字节的
            // nextFreeFast 从 mspam 返回下一个空闲对象，如果有的话。否则返回0。
			v := nextFreeFast(span)   
			if v == 0 { 
                // 若 mspan 没有空闲内存，则从中心缓存或页堆获取新的 mspan 内存对象
				v, span, shouldhelpgc = c.nextFree(tinySpanClass) 
			}
			x = unsafe.Pointer(v)
			(*[2]uint64)(x)[0] = 0
			(*[2]uint64)(x)[1] = 0
			// 根据剩余的空闲空间，来看看我们是否需要将已有的 tiny 块替换为新块
			if size < c.tinyoffset || c.tiny == 0 {
				c.tiny = uintptr(x)
				c.tinyoffset = size
			}
			size = maxTinySize
		} else {
            // 16B到32KB的小对象以及含有指针的微对象的内存分配
			var sizeclass uint8
			if size <= smallSizeMax-8 {
				sizeclass = size_to_class8[divRoundUp(size, smallSizeDiv)]
			} else {
				sizeclass = size_to_class128[divRoundUp(size-smallSizeMax, largeSizeDiv)]
			}
			size = uintptr(class_to_size[sizeclass])
			spc := makeSpanClass(sizeclass, noscan)  // 计算对象合适的 spanClass
			span = c.alloc[spc]    // 根据 spanClass 取对应的 mspan
			v := nextFreeFast(span)
			if v == 0 {
                //从中心缓存或页堆获取新的 mspan 内存对象
				v, span, shouldhelpgc = c.nextFree(spc)  
			}
			x = unsafe.Pointer(v)
			if needzero && span.needzero != 0 {
                // 用 memclrNoHeapPointers 进行零初始化
				memclrNoHeapPointers(unsafe.Pointer(v), size)
			}
		}
	} else {
        // 大于 32KB 的大对象内存分配
		shouldhelpgc = true
		systemstack(func() {
			span = largeAlloc(size, needzero, noscan) // 直接从 mheap 上分配 span
		})
		span.freeindex = 1
		span.allocCount = 1
		x = unsafe.Pointer(span.base())
		size = span.elemsize
	}

	var scanSize uintptr
	if !noscan {
		// If allocating a defer+arg block, now that we've picked a malloc size
		// large enough to hold everything, cut the "asked for" size down to
		// just the defer header, so that the GC bitmap will record the arg block
		// as containing nothing at all (as if it were unused space at the end of
		// a malloc block caused by size rounding).
		// The defer arg areas are scanned as part of scanstack.
		if typ == deferType {
			dataSize = unsafe.Sizeof(_defer{})
		}
		heapBitsSetType(uintptr(x), size, dataSize, typ)
		if dataSize > typ.size {
			// Array allocation. If there are any
			// pointers, GC has to scan to the last
			// element.
			if typ.ptrdata != 0 {
				scanSize = dataSize - typ.size + typ.ptrdata
			}
		} else {
			scanSize = typ.ptrdata
		}
		c.local_scan += scanSize
	}

    // 确保上面初始化 x 并设置 heap 对应的 bits 之类的存储操作发生
    // 在 caller 使 x 对垃圾收集器可见之前。否则的话，在那些弱有序
    // 的平台上，垃圾收集器可能会跟踪一个指向 x 的指针，
    // 但是看到的内存却是未初始化或者 heap bits 是过期的数据
	publicationBarrier()

	// GC 期间分配的黑色对象。所有持有 nil 对象的槽都无需扫描。
    // 这里可能会和 GC 发生竞争，如果正在标记 bit 时，需要原子地来执行这个动作
	if gcphase != _GCoff {
		gcmarknewobject(span, uintptr(x), size, scanSize)
	}

	if raceenabled {
		racemalloc(x, size)
	}

	if msanenabled {
		msanmalloc(x, size)
	}

	mp.mallocing = 0
	releasem(mp)

	if debug.allocfreetrace != 0 {
		tracealloc(x, size, typ)
	}

	if rate := MemProfileRate; rate > 0 {
		if rate != 1 && size < c.next_sample {
			c.next_sample -= size
		} else {
			mp := acquirem()
			profilealloc(mp, x, size)
			releasem(mp)
		}
	}

	if assistG != nil {
		// Account for internal fragmentation in the assist
		// debt now that we know it.
		assistG.gcAssistBytes -= int64(size - dataSize)
	}

	if shouldhelpgc {
		if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
			gcStart(t)
		}
	}

	return x
}

nextFreeFast() 线程缓存分配

nextFreeFast() 用于快速获取空闲的内存，没有则返回0

func nextFreeFast(s *mspan) gclinkptr {
	theBit := sys.Ctz64(s.allocCache) // Is there a free object in the allocCache?
	if theBit < 64 {
		result := s.freeindex + uintptr(theBit)
		if result < s.nelems {
			freeidx := result + 1
			if freeidx%64 == 0 && freeidx != s.nelems {
				return 0
			}
			s.allocCache >>= uint(theBit + 1)
			s.freeindex = freeidx  // 标记下一个空对象索引
			s.allocCount++
			return gclinkptr(result*s.elemsize + s.base())
		}
	}
	return 0
}

runtime.nextFreeFast 会利用内存管理单元中的 mspan.allocCache 字段，快速找到该字段中bitmao位是 1 的位数， 1 表示该位对应的内存空间是空闲的。

找到了空闲的对象后，就可以更新内存管理单元的 allocCache、freeindex 等字段并返回该片内存地址；如果没有找到空闲的内存，运行时会通过 runtime.mcache.nextFree 找到新的内存管理单元。

nextFree() 中心缓存分配

如果我们在线程缓存中没有找到可用的内存管理单元，runtime.mcache.refill 使用中心缓存中的内存管理单元替换已经不存在可用对象的结构体，该方法会调用新结构体的 runtime.mspan.nextFreeIndex 获取空闲的内存并返回。

// nextFree 会从 cached span 中寻找下一个空闲对象
// 如果未找到，会用一个持有可用对象的 span 来填充 cache 并将该对象返回，同时携带一个 flag
// 该 flag 会标志此次分配是否是重量级的内存分配操作。如果是重量级分配，那么 caller 必须
// 确定是否需要启动一次新的 GC 循环，如果 GC 当前活跃的话，该 goroutine 可能需要协助进行 GC
func (c *mcache) nextFree(spc spanClass) (v gclinkptr, s *mspan, shouldhelpgc bool) {
   s = c.alloc[spc]
   shouldhelpgc = false
   freeIndex := s.nextFreeIndex()
   if freeIndex == s.nelems {
      // The span is full.
      if uintptr(s.allocCount) != s.nelems {
         println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
         throw("s.allocCount != s.nelems && freeIndex == s.nelems")
      }
      c.refill(spc)  // 从中心列表扩充新的 span
      shouldhelpgc = true
      s = c.alloc[spc]

      freeIndex = s.nextFreeIndex()
   }

   if freeIndex >= s.nelems {
      throw("freeIndex is not valid")
   }

   v = gclinkptr(freeIndex*s.elemsize + s.base())
   s.allocCount++
   if uintptr(s.allocCount) > s.nelems {
      println("s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
      throw("s.allocCount > s.nelems")
   }
   return
}

mcache.refill()

mcache.refill 方法会为线程缓存 mcache 获取一个指定跨度类的内存管理单元 mspan，被替换的单元不能包含空闲的内存空间，而获取的单元中需要至少包含一个空闲对象用于分配内存：

// refill acquires a new span of span class spc for c. This span will
// have at least one free object. The current span in c must be full.
//
// Must run in a non-preemptible context since otherwise the owner of
// c could change.
func (c *mcache) refill(spc spanClass) {
   s := c.alloc[spc] // 把正在使用的 span 返还给中心列表 mcentral

   if uintptr(s.allocCount) != s.nelems {
      throw("refill of span with free space remaining")
   }
   if s != &emptymspan {
      // 标记这个 span 不在被mcache使用
      if s.sweepgen != mheap_.sweepgen+3 {
         throw("bad sweepgen in refill")
      }
      if go115NewMCentralImpl {
         mheap_.central[spc].mcentral.uncacheSpan(s)
      } else {
         atomic.Store(&s.sweepgen, mheap_.sweepgen)
      }
   }

   // 从中心列表 mcentrail 申请一个特定spanclass新的 span
   s = mheap_.central[spc].mcentral.cacheSpan()
   if s == nil {
      throw("out of memory")
   }

   if uintptr(s.allocCount) == s.nelems {
      throw("span has no free space")
   }

   // 表明此 span 已被缓存，并在下一个扫描阶段防止异步扫描。
   s.sweepgen = mheap_.sweepgen + 3

   c.alloc[spc] = s
}

sweepgen GC 扫描计数值，每次GC后会自增2，每个等待处理的 span 也有此字段。

largeAlloc() 大对象mheap分配

运行时对于大于 32KB 的大对象会单独处理，我们不会从线程缓存或者中心缓存中获取内存管理单元，而是直接在系统的栈中调用 runtime.largeAlloc 函数分配大片的内存。该函数会计算分配该对象所需要的页数，它会按照页数为在堆上申请内存。申请内存时会创建一个跨度类为 0 的 runtime.spanClass 并调用 runtime.mheap.alloc 分配一个管理对应内存的管理单元。

func largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
   // print("largeAlloc size=", size, "\n")

   if size+_PageSize < size {
      throw("out of memory")
   }
    // 计算所需页数，即 size 处以 page 大小 8KB
   npages := size >> _PageShift  
   if size&_PageMask != 0 {
      npages++
   }

   // Deduct credit for this span allocation and sweep if
   // necessary. mHeap_Alloc will also sweep npages, so this only
   // pays the debt down to npage pages.
   deductSweepCredit(npages*_PageSize, npages)

   spc := makeSpanClass(0, noscan)   //计算 spanClass
   s := mheap_.alloc(npages, spc, needzero)  // 从mheap申请内存
   if s == nil {
      throw("out of memory")
   }
   if go115NewMCentralImpl {
      // Put the large span in the mcentral swept list so that it's
      // visible to the background sweeper.
      mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
   }
   s.limit = s.base() + size
   heapBitsForAddr(s.base()).initSpan(s)
   return s
}

systemstack() 函数栈运行

runtime.systemstack 在系统栈上运行给定的函数。Go 文件中声明了函数签名。针对不同的cpu架构采用汇编语言实现。

// systemstack runs fn on a system stack.
// If systemstack is called from the per-OS-thread (g0) stack, or
// if systemstack is called from the signal handling (gsignal) stack,
// systemstack calls fn directly and returns.
// Otherwise, systemstack is being called from the limited stack
// of an ordinary goroutine. In this case, systemstack switches
// to the per-OS-thread stack, calls fn, and switches back.
// It is common to use a func literal as the argument, in order
// to share inputs and outputs with the code around the call
// to system stack:
//
//	... set up y ...
//	systemstack(func() {
//		x = bigcall(y)
//	})
//	... use x ...
//
//go:noescape
func systemstack(fn func())

以 amd64 为例

// func systemstack(fn func())
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	CMPQ	AX, m_gsignal(BX)
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	CMPQ	AX, m_curg(BX)
	JNE	bad

	// switch stacks
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack; tail call the function
	// Using a tail call here cleans up tracebacks since we won't stop
	// at an intermediate systemstack.
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	JMP	DI

bad:
	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX
	INT	$3

fixalloc 固定分配器

从 runtime 的角度,整个进程内的对象可分为两类:一种,自然是从 arena 区域分配的用户对象;另一种,则是运行时自身运行和管理所需的管理对象,比如管理 arena 内存片段的 span,提供无锁分配的 mcache等等。

管理对象的生命周期并不像用户对象那样复杂,且类型和长度都相对固定,所以算法策略显然不用那么复杂。还有,它们相对较长的生命周期也不适合占用 arena 区域,否则会导致更多碎片。为此,运行时专门设计了 fixalloc 固定分配器来为管理对象分配内。

fixAlloc是一个简单的用于固定大小对象的 free-list 分配器。fixAlloc 底层会调用 sysAlloc 方法分配内存, malloc 使用包裹在 sysAlloc 周围的 fixAlloc 来管理mcache和mspan等内存管理 off-heap 对象，如：

mheap: 全局唯一的，基于页(8K)管理堆内存.
mspan: mheap 中的页组成，内存管理单元.
mcentral: 按特定的 spanClass 管理 span.
mcache: 每个 P 的线程mspan缓存.
mstats: 内存分配数据统计.

type fixalloc struct {
   size   uintptr   // 固定长度分配
   first  func(arg, p unsafe.Pointer) //关联函数，初始化分配器时传入，called first time p is returned
   arg    unsafe.Pointer 	// 关联函数调用参数
   list   *mlink			// 复用链表
   chunk  uintptr // 内存块指针 use uintptr instead of unsafe.Pointer to avoid write barriers
   nchunk uint32  // 内存块长度
   inuse  uintptr // 已使用的字节数
   stat   *uint64
   zero   bool // zero allocations
}

//go:notinheap
type mlink struct {
	next *mlink
}

fixalloc.init():按给定的大小初始化分配器

func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *uint64)

fixalloc.alloc():分配内存

func (f *fixalloc) alloc() unsafe.Pointer {
   if f.size == 0 {
      print("runtime: use of FixAlloc_Alloc before FixAlloc_Init\n")
      throw("runtime: internal error")
   }
	// 优先从复用链表获取内存
   if f.list != nil {
      v := unsafe.Pointer(f.list)
      f.list = f.list.next
      f.inuse += f.size
      if f.zero {
         memclrNoHeapPointers(v, f.size)
      }
      return v
   }
   // 如果剩余内存块已不足分配,则获取新的16KB内存块， _FixAllocChunk == 16 << 10 字节
   if uintptr(f.nchunk) < f.size {
      // persistentalloc 函数会调用 sysAlloc
      f.chunk = uintptr(persistentalloc(_FixAllocChunk, 0, f.stat))
      f.nchunk = _FixAllocChunk
   }
   //获取新内存块时执行关联函数(通常用作初始化和贝数据)
   v := unsafe.Pointer(f.chunk)
   if f.first != nil {
      f.first(f.arg, v)
   }
   // 更新属性
   f.chunk = f.chunk + f.size
   f.nchunk -= uint32(f.size)
   f.inuse += f.size
   return v
}

fixalloc.free()释放内存，即放回服用链表即可

func (f *fixalloc) free(p unsafe.Pointer) {
	f.inuse -= f.size
	v := (*mlink)(p)
	v.next = f.list
	f.list = v
}

当运行时在初始化mheap时,根据不同的大小，一共构建了5 种固定分配器。

type mheap struct {
    ...
    spanalloc             fixalloc // allocator for span*
	cachealloc            fixalloc // allocator for mcache*
	specialfinalizeralloc fixalloc // allocator for specialfinalizer*
	specialprofilealloc   fixalloc // allocator for specialprofile*
	arenaHintAlloc        fixalloc // allocator for arenaHints
}

// Initialize the heap.
func (h *mheap) init() {
    ...
    h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
	h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys)
	h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys)
	h.specialprofilealloc.init(unsafe.Sizeof(specialprofile{}), nil, nil, &memstats.other_sys)
	h.arenaHintAlloc.init(unsafe.Sizeof(arenaHint{}), nil, nil, &memstats.other_sys)
    ...
}

固定分配器持有的这个16KB内存块通过函数persistentalloc分自 persistent 区域。该区域在很多地方为运行时提供后备内存,目的同样是为了减少并发锁,减少内存申请系统调用。

persistentalloc

type persistentAlloc struct {
   base *notInHeap
   off  uintptr
}

var globalAlloc struct {
   mutex
   persistentAlloc
}

func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
	var p *notInHeap
	systemstack(func() {
		p = persistentalloc1(size, align, sysStat)
	})
	return unsafe.Pointer(p)
}


// Must run on system stack because stack growth can (re)invoke it.
// See issue 9174.
//go:systemstack
func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap {
	const (
		maxBlock = 64 << 10 // VM reservation granularity is 64K on windows
	)

	if size == 0 {
		throw("persistentalloc: size == 0")
	}
	if align != 0 {
		if align&(align-1) != 0 {
			throw("persistentalloc: align is not a power of 2")
		}
		if align > _PageSize {
			throw("persistentalloc: align is too large")
		}
	} else {
		align = 8
	}
    // 如若大雨 分配大于64KB的内存块
	if size >= maxBlock {
		return (*notInHeap)(sysAlloc(size, sysStat))
	}

	mp := acquirem()
    //后备内存块存放位置(本地或全局)
	var persistent *persistentAlloc
	if mp != nil && mp.p != 0 {
		persistent = &mp.p.ptr().palloc
	} else {
		lock(&globalAlloc.mutex)
		persistent = &globalAlloc.persistentAlloc
	}
    //偏移位置对齐
	persistent.off = alignUp(persistent.off, align)
    //如果后备块空间不足,则重新申请
	if persistent.off+size > persistentChunkSize || persistent.base == nil {
        //申请新256KB后备内存
		persistent.base = (*notInHeap)(sysAlloc(persistentChunkSize, &memstats.other_sys))
		if persistent.base == nil {
			if persistent == &globalAlloc.persistentAlloc {
				unlock(&globalAlloc.mutex)
			}
			throw("runtime: cannot allocate memory")
		}

		// Add the new chunk to the persistentChunks list.
		for {
			chunks := uintptr(unsafe.Pointer(persistentChunks))
			*(*uintptr)(unsafe.Pointer(persistent.base)) = chunks
			if atomic.Casuintptr((*uintptr)(unsafe.Pointer(&persistentChunks)), chunks, uintptr(unsafe.Pointer(persistent.base))) {
				break
			}
		}
		persistent.off = alignUp(sys.PtrSize, align)
	}
	p := persistent.base.add(persistent.off)
	persistent.off += size
	releasem(mp)
	if persistent == &globalAlloc.persistentAlloc {
		unlock(&globalAlloc.mutex)
	}

	if sysStat != &memstats.other_sys {
		mSysStatInc(sysStat, size)
		mSysStatDec(&memstats.other_sys, size)
	}
	return p
}

recordspan()

在 mheap 初始化阶段 mheap.init()中，固定分配器函数h.spanalloc.init()，初始化时，传入了关联函数 recordspan(),其作用是按需扩张 mheap.allspans 的存储空间，mheap.allspans 是一个[]*mspan数组，里面保存了所有 mspan 对象指针,供垃圾回收时遍历。

内存分配器 spans 区域虽然保存了 page/span映射关系,但有很多重复,基于效率考虑,并不适合用来作为遍历对象。

// The bootstrap sequence is:
//
//	call osinit
//	call schedinit
//	make & queue new G
//	call runtime·mstart
//
// The new G calls runtime·main.
func schedinit() {
    ...
    mallocinit()
    ...
}

func mallocinit() {
    ...
    mheap_.init()
    ...
}
// Initialize the heap.
func (h *mheap) init() {
    ...
    h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
	...
}

//go:nowritebarrierrec
func recordspan(vh unsafe.Pointer, p unsafe.Pointer) {
	h := (*mheap)(vh)
	s := (*mspan)(p)
    // 如果 allspans 切片空间已满
	if len(h.allspans) >= cap(h.allspans) {
        // 计算新的容量
		n := 64 * 1024 / sys.PtrSize
		if n < cap(h.allspans)*3/2 {
			n = cap(h.allspans) * 3 / 2
		}
        // 申请新的内存空间，直接用指针写切片内部属性。
		var new []*mspan
		sp := (*slice)(unsafe.Pointer(&new))
		sp.array = sysAlloc(uintptr(n)*sys.PtrSize, &memstats.other_sys)
		if sp.array == nil {
			throw("runtime: cannot allocate memory")
		}
		sp.len = len(h.allspans)
		sp.cap = n
        //如果原空间有数据，复制数据新切片 new，然后释放
		if len(h.allspans) > 0 {
			copy(new, h.allspans)
		}
		oldAllspans := h.allspans
        // h.allspans 指向新申请的空间
		*(*notInHeapSlice)(unsafe.Pointer(&h.allspans)) = *(*notInHeapSlice)(unsafe.Pointer(&new))
        // 释放旧空间
		if len(oldAllspans) != 0 {
			sysFree(unsafe.Pointer(&oldAllspans[0]), uintptr(cap(oldAllspans))*unsafe.Sizeof(oldAllspans[0]), &memstats.other_sys)
		}
	}
	h.allspans = h.allspans[:len(h.allspans)+1]
	h.allspans[len(h.allspans)-1] = s
}

上面的扩张直接用 mmap在 arena 以外申请空间，即off-heap内存，该部分空间不受GC管辖，因为此空间就是GC的一部分，其他应用申请内存对 ` h.allspans`的 append 操作引发的扩张是在 arena区域的。