struct mm_struct {
    struct vm_area_struct *mmap;        /* list of VMAs */
    struct rb_root mm_rb;               /* red-black tree of VMAs */
    u32 vmacache_seqnum;                /* per-thread vmacache */
#ifdef CONFIG_MMU
    unsigned long (*get_unmapped_area) (struct file *filp,
                unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags);
#endif
    unsigned long mmap_base;            /* base of mmap area */
    unsigned long mmap_legacy_base;     /* base of mmap area in bottom-up allocations */
    unsigned long task_size;            /* size of task vm space */
    unsigned long highest_vm_end;       /* highest vma end address */
    pgd_t *pgd;                         /* page global directory */
    atomic_t mm_users;                  /* How many users with user space? */
    atomic_t mm_count;                  /* How many references to "struct mm_struct" (users count as 1) */
    atomic_long_t nr_ptes;              /* PTE page table pages */
#if CONFIG_PGTABLE_LEVELS > 2
    atomic_long_t nr_pmds;              /* PMD page table pages */
#endif
    int map_count;                      /* number of VMAs */

    spinlock_t page_table_lock;         /* Protects page tables and some counters */
    struct rw_semaphore mmap_sem;       /* semaphore protecting the VMAs */

    struct list_head mmlist;            /* List of maybe swapped mm's. These are globally strung
                                         * together off init_mm.mmlist, and are protected
                                         * by mmlist_lock */

    unsigned long hiwater_rss;          /* High-watermark of RSS usage */
    unsigned long hiwater_vm;           /* High-water virtual memory usage */

    unsigned long total_vm;             /* Total pages mapped */
    unsigned long locked_vm;            /* Pages that have PG_mlocked set */
    unsigned long pinned_vm;            /* Refcount permanently increased */
    unsigned long data_vm;              /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
    unsigned long exec_vm;              /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
    unsigned long stack_vm;             /* VM_STACK */
    unsigned long def_flags;

    unsigned long start_code, end_code, start_data, end_data;  /* start/end of the code segment, start/end of the data segment */
    unsigned long start_brk, brk, start_stack;                 /* start and current end of the heap, start of the process stack */
    unsigned long arg_start, arg_end, env_start, env_end;      /* start/end of the command-line arguments and of the environment variables */

    unsigned long saved_auxv[AT_VECTOR_SIZE];   /* for /proc/PID/auxv */

    /*
     * Special counters, in some configurations protected by the
     * page_table_lock, in other configurations by being atomic.
     */
    struct mm_rss_stat rss_stat;

    struct linux_binfmt *binfmt;

    cpumask_var_t cpu_vm_mask_var;

    /* Architecture-specific MM context */
    mm_context_t context;

    unsigned long flags;                /* Must use atomic bitops to access the bits */

    struct core_state *core_state;      /* coredumping support */
#ifdef CONFIG_AIO
    spinlock_t ioctx_lock;              /* protects the AIO I/O context table */
    struct kioctx_table __rcu *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
    /*
     * "owner" points to a task that is regarded as the canonical
     * user/owner of this mm. All of the following must be true in
     * order for it to be changed:
     *
     * current == mm->owner
     * current->mm != mm
     * new_owner->mm == mm
     * new_owner->alloc_lock is held
     */
    struct task_struct __rcu *owner;
#endif

    /* store ref to file /proc/<pid>/exe symlink points to */
    struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
    struct mmu_notifier_mm *mmu_notifier_mm;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
    pgtable_t pmd_huge_pte;             /* protected by page_table_lock */
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
    struct cpumask cpumask_allocation;
#endif
#ifdef CONFIG_NUMA_BALANCING
    /*
     * numa_next_scan is the next time that the PTEs will be marked
     * pte_numa. NUMA hinting faults will gather statistics and migrate
     * pages to new nodes if necessary.
     */
    unsigned long numa_next_scan;

    /* Restart point for scanning and setting pte_numa */
    unsigned long numa_scan_offset;

    /* numa_scan_seq prevents two threads setting pte_numa */
    int numa_scan_seq;
#endif
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
    /*
     * An operation with batched TLB flushing is going on. Anything that
     * can move process memory needs to flush the TLB when moving a
     * PROT_NONE or PROT_NUMA mapped page.
     */
    bool tlb_flush_pending;
#endif
    struct uprobes_state uprobes_state;
#ifdef CONFIG_X86_INTEL_MPX
    /* address of the bounds directory */
    void __user *bd_addr;
#endif
#ifdef CONFIG_HUGETLB_PAGE
    atomic_long_t hugetlb_usage;
#endif
};
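The start_code/end_code, start_data/end_data, start_brk/brk, start_stack fields and the *_vm counters describe the classic process memory layout. A minimal sketch of how code running in process context might read a few of them, taking mmap_sem for reading; the helper name dump_mm_layout is hypothetical, not a kernel function.

#include <linux/mm_types.h>
#include <linux/printk.h>
#include <linux/rwsem.h>

/* Hypothetical helper: print a process's memory layout from its mm_struct,
 * holding mmap_sem for reading while the fields are sampled. */
static void dump_mm_layout(struct mm_struct *mm)
{
    down_read(&mm->mmap_sem);
    pr_info("code  [%lx, %lx)\n", mm->start_code, mm->end_code);
    pr_info("data  [%lx, %lx)\n", mm->start_data, mm->end_data);
    pr_info("heap  [%lx, %lx)\n", mm->start_brk, mm->brk);
    pr_info("stack starts at %lx, %d VMAs, %lu pages mapped\n",
            mm->start_stack, mm->map_count, mm->total_vm);
    up_read(&mm->mmap_sem);
}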
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
    ...
    /*
     * Are we cloning a kernel thread?
     *
     * We need to steal a active VM for that..
     */
    oldmm = current->mm;
    if (!oldmm)
        return 0;
    ...
    if (clone_flags & CLONE_VM) {
        /* share the parent's address space: just take another mm_users reference */
        atomic_inc(&oldmm->mm_users);
        mm = oldmm;
        goto good_mm;
    }

    retval = -ENOMEM;
    /* otherwise duplicate the parent's mm_struct (fork semantics) */
    mm = dup_mm(tsk);
    if (!mm)
        goto fail_nomem;
    ...
}
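Seen from user space, the two branches above correspond to clone() with and without CLONE_VM. A hedged sketch in ordinary Linux user code (not from the kernel source): with CLONE_VM the child shares the parent's mm_struct, so a store made by the child is visible to the parent.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

static int shared = 0;

static int child_fn(void *arg)
{
    shared = 42;          /* same address space: the parent sees this write */
    return 0;
}

int main(void)
{
    char *stack = malloc(64 * 1024);
    if (!stack)
        return 1;
    /* CLONE_VM takes the atomic_inc(&oldmm->mm_users) path in copy_mm();
     * a plain fork() would take the dup_mm() path instead. */
    pid_t pid = clone(child_fn, stack + 64 * 1024, CLONE_VM | SIGCHLD, NULL);
    if (pid < 0)
        return 1;
    waitpid(pid, NULL, 0);
    printf("shared = %d\n", shared);   /* prints 42 */
    return 0;
}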
Destroying the memory descriptor

do_exit()
  -> exit_mm(tsk)
     -> mmput(mm)
        if (atomic_dec_and_test(&mm->mm_users))                      /* last user of the address space */
        -> mmdrop(mm)
           if (unlikely(atomic_dec_and_test(&mm->mm_count)))         /* last reference to the struct itself */
           -> __mmdrop(mm)
              -> free_mm(mm)    /* == kmem_cache_free(mm_cachep, (mm)) */
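Code outside the exit path follows the same two-counter discipline. A hedged sketch using the real helpers get_task_mm(), which takes an mm_users reference and returns NULL for kernel threads, and mmput(), which drops it and, when it was the last one, falls through to the mmdrop()/__mmdrop() chain above; the function peek_total_vm itself is hypothetical.

#include <linux/mm.h>
#include <linux/sched.h>

/* Hypothetical helper: safely read a field of another task's mm. */
static unsigned long peek_total_vm(struct task_struct *task)
{
    struct mm_struct *mm = get_task_mm(task);   /* +1 on mm_users, or NULL */
    unsigned long pages = 0;

    if (mm) {
        pages = mm->total_vm;
        mmput(mm);                              /* -1 on mm_users */
    }
    return pages;
}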
/*
 * context_switch - switch to the new MM and the new thread's register state.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next)
{
    struct mm_struct *mm, *oldmm;
    ...
    mm = next->mm;            /* memory descriptor of the task being scheduled in */
    oldmm = prev->active_mm;  /* active_mm of the previous (i.e. currently running) task */
    ...
    if (!mm) {
        /*
         * The incoming task is a kernel thread: keep the previous task's
         * address space so the kernel thread can use its page tables when
         * needed. Kernel threads never access user-space memory; they only
         * use the kernel part of the address space, which has the same
         * meaning in every process.
         */
        next->active_mm = oldmm;
        atomic_inc(&oldmm->mm_count);
        enter_lazy_tlb(oldmm, next);
    } else
        switch_mm(oldmm, mm, next);   /* architecture-specific implementation */

    if (!prev->mm) {
        /*
         * The previous (i.e. current) task is a kernel thread.
         * Why set active_mm to NULL here? "active_mm != mm" is used as a
         * condition in places such as TLB flushing and mmdrop(). The next
         * time this kernel thread is scheduled in, the assignment above
         * simply overwrites the NULL.
         */
        prev->active_mm = NULL;
        rq->prev_mm = oldmm;
    }
    ...
    /* Here we just switch the register state and the stack. */
    switch_to(prev, next, prev);
    barrier();

    return finish_task_switch(prev);
}
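A related aside, hedged and not part of the original text: kernel threads that genuinely need to touch a user address space can adopt one explicitly with use_mm()/unuse_mm() (mm/mmu_context.c in kernels of this vintage), which set current->mm and current->active_mm to the borrowed mm and perform the switch_mm().

#include <linux/mmu_context.h>

/* Hypothetical example: a kernel thread temporarily adopting a user mm,
 * e.g. to do copy_to_user()-style work on that process's behalf. */
static void kthread_work_on_mm(struct mm_struct *mm)
{
    use_mm(mm);        /* current->mm = current->active_mm = mm; switch_mm() */
    /* ... access the user address space here ... */
    unuse_mm(mm);      /* current->mm becomes NULL again */
}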
/*
 * This struct defines a memory VMM memory area. There is one of these
 * per VM-area/task.  A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 */
struct vm_area_struct {
    /* The first cache line has the info for VMA tree walking. */

    unsigned long vm_start;      /* Our start address within vm_mm. */
    unsigned long vm_end;        /* The first byte after our end address within vm_mm. */

    /* linked list of VM areas per task, sorted by address */
    struct vm_area_struct *vm_next, *vm_prev;

    struct rb_node vm_rb;        /* this VMA's node in the red-black tree */

    /*
     * Largest free memory gap in bytes to the left of this VMA.
     * Either between this VMA and vma->vm_prev, or between one of the
     * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
     * get_unmapped_area find a free area of the right size.
     */
    unsigned long rb_subtree_gap;

    /* Second cache line starts here. */

    struct mm_struct *vm_mm;     /* The address space we belong to. */
    pgprot_t vm_page_prot;       /* Access permissions of this VMA. */
    unsigned long vm_flags;      /* Flags, see mm.h. */

    /*
     * For areas with an address space and backing store,
     * linkage into the address_space->i_mmap interval tree.
     */
    struct {
        struct rb_node rb;
        unsigned long rb_subtree_last;
    } shared;

    /*
     * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
     * list, after a COW of one of the file pages.  A MAP_SHARED vma
     * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
     * or brk vma (with NULL file) can only be in an anon_vma list.
     */
    struct list_head anon_vma_chain;  /* Serialized by mmap_sem &
                                       * page_table_lock */
    struct anon_vma *anon_vma;        /* Serialized by page_table_lock */

    /* Function pointers to deal with this struct. */
    const struct vm_operations_struct *vm_ops;

    /* Information about our backing store: */
    unsigned long vm_pgoff;      /* Offset (within vm_file) in PAGE_SIZE units */
    struct file *vm_file;        /* File we map to (can be NULL). */
    void *vm_private_data;       /* was vm_pte (shared mem) */

#ifndef CONFIG_MMU
    struct vm_region *vm_region; /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
    struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
    struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
};
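To see how these fields map onto what /proc/PID/maps prints, here is a minimal hedged sketch; show_vma is a hypothetical helper, not kernel code.

#include <linux/mm.h>
#include <linux/printk.h>

/* Hypothetical helper: print one VMA roughly the way /proc/PID/maps does. */
static void show_vma(struct vm_area_struct *vma)
{
    pr_info("%08lx-%08lx %c%c%c%c pgoff=%lx %s\n",
            vma->vm_start, vma->vm_end,
            (vma->vm_flags & VM_READ)   ? 'r' : '-',
            (vma->vm_flags & VM_WRITE)  ? 'w' : '-',
            (vma->vm_flags & VM_EXEC)   ? 'x' : '-',
            (vma->vm_flags & VM_SHARED) ? 's' : 'p',
            vma->vm_pgoff,
            vma->vm_file ? "file-backed" : "anonymous");
}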
/*
 * These are the virtual MM functions - opening of an area, closing and
 * unmapping it (needed to keep files on disk up-to-date etc), pointer
 * to the functions called when a no-page or a wp-page exception occurs.
 */
struct vm_operations_struct {
    /* called when the memory area is added to an address space */
    void (*open)(struct vm_area_struct *area);
    /* called when the memory area is removed from an address space */
    void (*close)(struct vm_area_struct *area);
    int (*mremap)(struct vm_area_struct *area);
    /* called by the page-fault handler when a page that is not present
     * in physical memory is accessed */
    int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
    int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
                     pmd_t *, unsigned int flags);
    void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);

    /* notification that a previously read-only page is about to become
     * writable, if an error is returned it will cause a SIGBUS */
    int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);

    /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
    int (*pfn_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);

    /* called by access_process_vm when get_user_pages() fails, typically
     * for use by special VMAs that can switch between memory and hardware */
    int (*access)(struct vm_area_struct *vma, unsigned long addr,
                  void *buf, int len, int write);

    /* Called by the /proc/PID/maps code to ask the vma whether it
     * has a special name.  Returning non-NULL will also cause this
     * vma to be dumped unconditionally. */
    const char *(*name)(struct vm_area_struct *vma);

#ifdef CONFIG_NUMA
    /*
     * set_policy() op must add a reference to any non-NULL @new mempolicy
     * to hold the policy upon return.  Caller should pass NULL @new to
     * remove a policy and fall back to surrounding context--i.e. do not
     * install a MPOL_DEFAULT policy, nor the task or system default
     * mempolicy.
     */
    int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

    /*
     * get_policy() op must add reference [mpol_get()] to any policy at
     * (vma,addr) marked as MPOL_SHARED.  The shared policy infrastructure
     * in mm/mempolicy.c will do this automatically.
     * get_policy() must NOT add a ref if the policy at (vma,addr) is not
     * marked as MPOL_SHARED.  vma policies are protected by the mmap_sem.
     * If no [shared/vma] mempolicy exists at the addr, get_policy() op
     * must return NULL--i.e., do not "fallback" to task or system default
     * policy.
     */
    struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
                                    unsigned long addr);
#endif
    /*
     * Called by vm_normal_page() for special PTEs to find the
     * page for @addr.  This is useful if the default behavior
     * (using pte_page()) would not find the correct page.
     */
    struct page *(*find_special_page)(struct vm_area_struct *vma,
                                      unsigned long addr);
};
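A common user of this interface is a character driver that implements mmap() by servicing faults lazily instead of mapping pages eagerly. A hedged sketch against the fault() signature shown above (the form taking a vma); all demo_* names and the demo_page backing page are hypothetical.

#include <linux/fs.h>
#include <linux/mm.h>

static struct page *demo_page;   /* hypothetical backing page, allocated elsewhere */

/* Called by the page-fault handler the first time the mapping is touched. */
static int demo_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
    get_page(demo_page);         /* the fault path consumes one page reference */
    vmf->page = demo_page;
    return 0;
}

static const struct vm_operations_struct demo_vm_ops = {
    .fault = demo_fault,
};

/* file_operations.mmap: just install the vm_ops; pages appear on demand. */
static int demo_mmap(struct file *filp, struct vm_area_struct *vma)
{
    vma->vm_ops = &demo_vm_ops;
    return 0;
}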
The list and tree structures of VMAs
The VMAs can be reached through either the mmap field or the mm_rb field of struct mm_struct.
Each of these two fields independently reaches all of the VMA objects associated with the memory descriptor.
They contain pointers to exactly the same struct vm_area_struct objects; only the way they are organized differs.
The mmap field links all VMAs on a linked list.
Each struct vm_area_struct is linked into the list through its own vm_next field.
The areas are sorted in order of increasing address.
mmap points to the first VMA in the list; the last element's next pointer is NULL.
The mm_rb field links all VMAs into a red-black tree.
mm_rb points to the root node of the tree.
Each struct vm_area_struct is linked into the tree through its own vm_rb field.
The list is used for traversing all VMAs, while the red-black tree is used to locate a specific VMA, as the sketch below illustrates.
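A minimal hedged sketch of the two access paths; both count_vmas_* helpers are hypothetical, and the caller is assumed to hold mm->mmap_sem.

#include <linux/mm_types.h>
#include <linux/rbtree.h>

/* Walk every VMA via the mmap list (the natural way to visit all areas). */
static int count_vmas_by_list(struct mm_struct *mm)
{
    struct vm_area_struct *vma;
    int n = 0;

    for (vma = mm->mmap; vma; vma = vma->vm_next)
        n++;
    return n;
}

/* Walk every VMA via the mm_rb red-black tree; the tree is normally used
 * for lookup, but rb_first()/rb_next() also give an in-order traversal. */
static int count_vmas_by_tree(struct mm_struct *mm)
{
    struct rb_node *node;
    int n = 0;

    for (node = rb_first(&mm->mm_rb); node; node = rb_next(node))
        n++;
    return n;
}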
Manipulating memory areas
find_vma()
find_vma() finds, in the given address space, the first VMA whose vm_end is greater than addr.
Note that the start address of the returned VMA may itself be greater than addr, so addr does not necessarily lie inside the returned VMA.
mm/mmap.c
/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
    struct rb_node *rb_node;
    struct vm_area_struct *vma;

    /* Check the cache first. */
    /*
     * Look in this mm's VMA cache first and return the hit if there is one.
     * struct task_struct has a member
     * struct vm_area_struct *vmacache[VMACACHE_SIZE] holding cached VMAs.
     */
    vma = vmacache_find(mm, addr);
    if (likely(vma))
        return vma;

    /* Cache miss: search the red-black tree instead. */
    rb_node = mm->mm_rb.rb_node;

    while (rb_node) {
        struct vm_area_struct *tmp;

        /* the VMA at this tree node */
        tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);

        if (tmp->vm_end > addr) {
            /* a candidate, but we want the first such VMA */
            vma = tmp;                   /* record it either way */
            if (tmp->vm_start <= addr)   /* addr lies inside this VMA: it must be the first */
                break;                   /* stop searching */
            /* addr is below vm_start, so a VMA with an even smaller vm_end
             * may still qualify: keep searching to the left */
            rb_node = rb_node->rb_left;
        } else
            /* this VMA's vm_end is already <= addr: search to the right.
             * If the right child is empty, the vma recorded in an earlier
             * iteration is the one we want; otherwise repeat the process. */
            rb_node = rb_node->rb_right;
    }

    if (vma)
        vmacache_update(addr, vma);
    return vma;
}
EXPORT_SYMBOL(find_vma);
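A hedged usage sketch: because find_vma() only guarantees addr < vm_end, a caller that wants "the VMA containing addr" must also check vm_start. The wrapper name vma_containing is hypothetical, and the caller is assumed to hold mm->mmap_sem.

#include <linux/mm.h>

/* Hypothetical wrapper: return the VMA that actually contains addr,
 * or NULL if addr falls into a hole between mappings. */
static struct vm_area_struct *vma_containing(struct mm_struct *mm,
                                             unsigned long addr)
{
    struct vm_area_struct *vma = find_vma(mm, addr);

    if (vma && vma->vm_start <= addr)
        return vma;          /* vm_start <= addr < vm_end */
    return NULL;             /* find_vma() returned a VMA above addr, or none */
}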
/*
 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
 */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
              struct vm_area_struct **pprev)
{
    struct vm_area_struct *vma;

    vma = find_vma(mm, addr);
    if (vma) {
        *pprev = vma->vm_prev;
    } else {
        /* When no VMA has vm_end greater than addr, the rightmost node of
         * the tree is the last VMA below addr, i.e. the "previous" one. */
        struct rb_node *rb_node = mm->mm_rb.rb_node;
        *pprev = NULL;
        while (rb_node) {
            *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
            rb_node = rb_node->rb_right;
        }
    }
    return vma;
}
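A hedged usage sketch of find_vma_prev(): callers such as the stack-expansion and mmap paths want both the VMA above addr and its predecessor, for example to check whether the gap between them is large enough. The helper name gap_around is hypothetical, and the caller is assumed to hold mm->mmap_sem.

#include <linux/mm.h>
#include <linux/sched.h>

/* Hypothetical helper: size of the unmapped gap that addr falls into,
 * or 0 if addr is already covered by a VMA. */
static unsigned long gap_around(struct mm_struct *mm, unsigned long addr)
{
    struct vm_area_struct *vma, *prev;
    unsigned long gap_start, gap_end;

    vma = find_vma_prev(mm, addr, &prev);
    if (vma && vma->vm_start <= addr)
        return 0;                            /* addr lies inside vma */

    gap_start = prev ? prev->vm_end : 0;
    gap_end   = vma  ? vma->vm_start : TASK_SIZE;
    return gap_end - gap_start;
}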