mm 是 Linux 0.11 内存管理的模块,一共两个文件 memory.c 与 page.s。开篇先来“再续前缘”,继续探讨写时复制技术的后半部分。
写时复制之页错误
上一篇文章提到了,当父/子进程其中之一对只读的内存页面进行写操作时,会产生页错误的异常,该异常处理程序负责将共享的内存页面复制到新内存页中,并重新构建该页表项,使其指向新内存页并可写。实际上,页错误异常不仅由写保护引发,还有可能是缺页引起的。页错误异常就定义在 page.s 中,该文件也就只有 page_fault 的代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
|
/*
 * page_fault: entry point of the page-fault exception handler.
 *
 * Saves the registers it uses, switches the data segment registers to the
 * kernel data segment, then pushes (fault address, error code) and calls
 * do_wp_page when the error code has its P bit set, or do_no_page otherwise.
 */
.globl page_fault
/* the linear address that triggered the page fault is held in control register CR2 */
page_fault:
	xchgl %eax,(%esp)	/* fetch the error code into eax (eax is saved in its place) */
	pushl %ecx
	pushl %edx
	push %ds
	push %es
	push %fs		/* save context */
	movl $0x10,%edx
	mov %dx,%ds
	mov %dx,%es
	mov %dx,%fs		/* point the segment registers at the kernel data segment */
	movl %cr2,%edx		/* put the faulting linear address into edx */
	pushl %edx
	pushl %eax		/* push arguments (fault linear address and error code) */
	testl $1,%eax		/* P bit non-zero: the fault was not caused by a missing page */
	jne 1f			/* ...but by write protection, so go call do_wp_page */
	call do_no_page		/* missing page: call do_no_page */
	jmp 2f
1:	call do_wp_page
2:	addl $8,%esp		/* drop the two pushed arguments */
	pop %fs
	pop %es
	pop %ds
	popl %edx
	popl %ecx
	popl %eax		/* restore context */
	iret
|
先来看由写保护引起的页错误的处理函数 do_wp_page(之后涉及的函数都在 memory.c 中):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
|
/*
 * do_wp_page - handle a write-protection page fault.
 *
 * error_code: fault error code pushed by page_fault (not used here).
 * address:    linear address whose write caused the fault.
 *
 * Locates the page-table entry that maps `address` and hands it to
 * un_wp_page().
 */
void do_wp_page(unsigned long error_code,unsigned long address)
{
	unsigned long *dir_entry;
	unsigned long table_base;
	unsigned long entry_offset;

#if 0
	if (CODE_SPACE(address))
		do_exit(SIGSEGV);
#endif
	/* Byte offset of the page-directory entry: top 10 address bits * 4. */
	dir_entry = (unsigned long *) ((address>>20) & 0xffc);
	/* The directory entry minus its low 12 flag bits is the page table's base. */
	table_base = 0xfffff000 & *dir_entry;
	/* Byte offset of the entry inside that table: middle 10 address bits * 4. */
	entry_offset = (address>>10) & 0xffc;

	un_wp_page((unsigned long *) (entry_offset + table_base));
}
/*
 * un_wp_page - make the page behind a page-table entry writable.
 *
 * table_entry: pointer to the page-table entry to fix up.
 *
 * If the frame lies at or above LOW_MEM and its mem_map count is exactly 1,
 * the entry's write bit (2) is simply set.  Otherwise a fresh page is
 * obtained, the old frame's count is dropped (only for frames at or above
 * LOW_MEM), the entry is pointed at the new page with flag bits 7 (which
 * include present and writable), and the old contents are copied over.
 */
void un_wp_page(unsigned long * table_entry)
{
	unsigned long shared_frame = 0xfffff000 & *table_entry;
	unsigned long private_frame;

	/* Sole user of the frame: no copy needed, just allow writes. */
	if (shared_frame >= LOW_MEM && mem_map[MAP_NR(shared_frame)] == 1) {
		*table_entry |= 2;
		invalidate();
		return;
	}

	private_frame = get_free_page();
	if (!private_frame)
		oom();

	if (shared_frame >= LOW_MEM)
		mem_map[MAP_NR(shared_frame)]--;

	/* Remap first, flush, then copy - same order as before. */
	*table_entry = private_frame | 7;
	invalidate();
	copy_page(shared_frame, private_frame);
}
/*
 * copy_page(from, to) - copy one page (1024 32-bit words) from address
 * `from` to address `to`.
 *
 * "rep ; movsl" advances ESI and EDI and counts ECX down to zero, so none of
 * the three may be passed as input-only operands: the compiler would assume
 * they still hold their original values afterwards.  They are therefore
 * declared as read-write operands on private copies.  The "memory" clobber
 * tells the compiler the statement reads and writes memory it cannot see,
 * and __volatile__ keeps the copy from being optimised away.
 *
 * Usable as a single statement, exactly like the previous definition.
 */
#define copy_page(from,to) \
do { \
	unsigned long cp_src_ = (unsigned long) (from); \
	unsigned long cp_dst_ = (unsigned long) (to); \
	unsigned long cp_cnt_ = 1024; \
	__asm__ __volatile__("cld ; rep ; movsl" \
		: "+S" (cp_src_), "+D" (cp_dst_), "+c" (cp_cnt_) \
		: \
		: "memory", "cc"); \
} while (0)
|
于是,写时复制的全貌就展现完毕了。由缺页引发的页错误处理涉及到块设备的知识,之后再做记录。
mem_map数组
之前涉及内存管理的代码都或多或少地有 mem_map 数组的影子。这个字符数组用于记录 1MB 以上物理内存的使用情况:每个字节对应一个物理页面,其数值表示该页面当前被引用(占用)的次数,0 代表该页面空闲;100(即 USED)则是一个特殊标记,代表该页面已被完全占用,不能再被分配/共享。Linux 0.11 的物理内存区域划分如下:
mm 模块中的几类函数
释放内存
接着来看 memory.c 中还剩下的一些函数,可根据功能分为几类,首先是释放内存:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
|
/*
 * free_page_tables - release a run of page tables and the pages they map.
 *
 * from: starting linear address; must be 4 MB aligned and non-zero.
 * size: length in bytes, rounded up to whole 4 MB units (one page-directory
 *       entry each).
 *
 * For every present directory entry in the range, each present page is
 * freed and its table entry cleared, then the page table itself is freed
 * and the directory entry cleared.  Always returns 0.
 */
int free_page_tables(unsigned long from,unsigned long size)
{
	unsigned long *dir_entry;
	unsigned long *table;
	unsigned long remaining;
	unsigned long slot;

	if (from & 0x3fffff)
		panic("free_page_tables called with wrong alignment");
	if (!from)
		panic("Trying to free up swapper memory space");

	remaining = (size + 0x3fffff) >> 22;
	dir_entry = (unsigned long *) ((from>>20) & 0xffc);

	while (remaining-- > 0) {
		if (1 & *dir_entry) {
			table = (unsigned long *) (0xfffff000 & *dir_entry);
			for (slot = 0; slot < 1024; slot++) {
				if (1 & table[slot])
					free_page(0xfffff000 & table[slot]);
				table[slot] = 0;
			}
			/* The page table occupies a page of its own; release it too. */
			free_page(0xfffff000 & *dir_entry);
			*dir_entry = 0;
		}
		dir_entry++;
	}

	invalidate();
	return 0;
}
/*
 * free_page - drop one reference to the physical page at `addr`.
 *
 * Addresses below LOW_MEM are ignored.  Addresses at or above HIGH_MEMORY
 * are reported through panic().  Otherwise the page's mem_map counter is
 * decremented; if it was already zero, it is left at zero and panic() is
 * called.
 */
void free_page(unsigned long addr)
{
	unsigned long idx;

	if (addr < LOW_MEM)
		return;
	if (addr >= HIGH_MEMORY)
		panic("trying to free nonexistent page");

	idx = (addr - LOW_MEM) >> 12;

	if (mem_map[idx] != 0) {
		mem_map[idx]--;
		return;
	}

	/* Counter was already zero: keep it at zero and complain. */
	mem_map[idx] = 0;
	panic("trying to free free page");
}
|
获取空闲页面
第二类有关获取空闲页面:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
|
/*
 * get_empty_page - obtain a free page and map it at linear `address`.
 *
 * If either get_free_page() or put_page() fails, the page (possibly 0) is
 * passed to free_page() and oom() is called.
 */
void get_empty_page(unsigned long address)
{
	unsigned long page = get_free_page();

	if (page && put_page(page, address))
		return;

	free_page(page);
	oom();
}
/*
 * get_free_page - find a free physical page, mark it used and zero it.
 *
 * Scans mem_map backwards from its last entry for a zero byte.  Returns the
 * physical address of the page found, or 0 when no entry is zero.
 *
 * NOTE(review): the asm statement has no clobber list, yet it changes ECX
 * and EDI (both input-only operands) and EDX, and writes memory.  Confirm
 * the compiler in use tolerates this, or tie the registers to dummy outputs.
 */
unsigned long get_free_page(void)
{
	register unsigned long __res asm("ax");

	__asm__("std ; repne ; scasb\n\t"	/* scan downwards for a byte equal to AL (0) */
		"jne 1f\n\t"			/* none found: leave with EAX still 0 */
		"movb $1,1(%%edi)\n\t"		/* EDI stepped past the hit, so +1 is the free entry; mark it used */
		"sall $12,%%ecx\n\t"		/* remaining count == entry index; times 4096 */
		"addl %2,%%ecx\n\t"		/* plus LOW_MEM gives the page's physical address */
		"movl %%ecx,%%edx\n\t"		/* keep the address in EDX */
		"movl $1024,%%ecx\n\t"		/* 1024 dwords per page */
		"leal 4092(%%edx),%%edi\n\t"	/* start at the page's last dword (direction flag still set) */
		"rep ; stosl\n\t"		/* fill the page with EAX (0) */
		"movl %%edx,%%eax\n\t"		/* return value: page address */
		"1:" "cld\n\t"			/* clear the direction flag on both paths */
		:"=a" (__res)
		:"0" (0),"i" (LOW_MEM),"c" (PAGING_PAGES),
		"D" (mem_map+PAGING_PAGES-1)
		);
	return __res;
}
/*
 * put_page - map physical page `page` at linear address `address`.
 *
 * Returns `page` on success, or 0 if a page table had to be allocated and
 * get_free_page() failed.
 *
 * The two sanity checks only log through printk(); execution continues
 * afterwards, exactly as before.
 */
unsigned long put_page(unsigned long page,unsigned long address)
{
	unsigned long *dir_entry;
	unsigned long *table;
	unsigned long fresh;

	if (page < LOW_MEM || page >= HIGH_MEMORY)
		printk("Trying to put page %p at %p\n",page,address);
	if (mem_map[(page-LOW_MEM)>>12] != 1)
		printk("mem_map disagrees with %p at %p\n",page,address);

	dir_entry = (unsigned long *) ((address>>20) & 0xffc);
	if (*dir_entry & 1) {
		/* Page table already present: strip the flag bits to get its base. */
		table = (unsigned long *) (0xfffff000 & *dir_entry);
	} else {
		/* No page table yet: allocate one and hook it into the directory. */
		fresh = get_free_page();
		if (!fresh)
			return 0;
		*dir_entry = fresh | 7;
		table = (unsigned long *) fresh;
	}

	table[(address>>12) & 0x3ff] = page | 7;
	return page;
}
|
共享内存
第三类有关共享内存,share_page 函数仅被缺页处理函数 do_no_page 调用。这里引入一个新概念——页面逻辑地址,意为该页面地址是以进程的代码/数据起始地址算起的页面地址。以下是 do_no_page 部分代码:
1 2 3 4 5 6 7 8
|
/*
 * Excerpt from do_no_page: round the faulting address down to its page
 * boundary, turn it into an offset from the current task's start_code,
 * and return early if share_page() reports success for that offset.
 */
address &= 0xfffff000;
tmp = address - current->start_code;
if (share_page(tmp))
	return;
|
share_page 的具体实现:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
|
/*
 * share_page - try to find another task that can share a page with us.
 *
 * address: page address relative to the task's start_code.
 *
 * Gives up at once when the current task has no `executable`, or when that
 * executable's i_count is below 2.  Otherwise walks the task slots from
 * LAST_TASK down to (but not including) FIRST_TASK and calls
 * try_to_share() on every other task with the same `executable`.
 *
 * Returns 1 as soon as try_to_share() succeeds, 0 otherwise.
 */
static int share_page(unsigned long address)
{
	struct task_struct **slot;
	struct task_struct *other;

	if (!current->executable || current->executable->i_count < 2)
		return 0;

	slot = &LAST_TASK;
	while (slot > &FIRST_TASK) {
		other = *slot;
		--slot;
		if (other
		    && other != current
		    && other->executable == current->executable
		    && try_to_share(address, other))
			return 1;
	}
	return 0;
}
/*
 * try_to_share - try to map into the current task the page that task `p`
 * has at the same offset `address` (relative to each task's start_code).
 *
 * Returns 1 when the page was shared, 0 when `p` has no suitable page there.
 *
 * NOTE(review): the nested `if (!(to & 1)) if (to = get_free_page()) ...
 * else oom();` relies on the `else` binding to the inner `if`, and on an
 * assignment used as a condition.  Both look intentional.
 */
static int try_to_share(unsigned long address, struct task_struct * p)
{
	unsigned long from;
	unsigned long to;
	unsigned long from_page;
	unsigned long to_page;
	unsigned long phys_addr;

	/* Page-directory entry addresses: offset for `address` plus each task's base. */
	from_page = to_page = ((address>>20) & 0xffc);
	from_page += ((p->start_code>>20) & 0xffc);
	to_page += ((current->start_code>>20) & 0xffc);
	/* Source directory entry must be present. */
	from = *(unsigned long *) from_page;
	if (!(from & 1))
		return 0;
	from &= 0xfffff000;
	/* From here on from_page is the address of the source page-table entry. */
	from_page = from + ((address>>10) & 0xffc);
	phys_addr = *(unsigned long *) from_page;
	/* Require bit 0 set and bit 0x40 clear (on x86: present and not dirty). */
	if ((phys_addr & 0x41) != 0x01)
		return 0;
	phys_addr &= 0xfffff000;
	if (phys_addr >= HIGH_MEMORY || phys_addr < LOW_MEM)
		return 0;
	/* Destination directory entry: allocate a page table if none is present. */
	to = *(unsigned long *) to_page;
	if (!(to & 1))
		if (to = get_free_page())
			*(unsigned long *) to_page = to | 7;
		else
			oom();
	to &= 0xfffff000;
	/* From here on to_page is the address of the destination page-table entry. */
	to_page = to + ((address>>10) & 0xffc);
	if (1 & *(unsigned long *) to_page)
		panic("try_to_share: to_page already exists");
	/* Clear the write bit in the source entry, then copy that entry across. */
	*(unsigned long *) from_page &= ~2;
	*(unsigned long *) to_page = *(unsigned long *) from_page;
	invalidate();
	/* One more user of this physical page. */
	phys_addr -= LOW_MEM;
	phys_addr >>= 12;
	mem_map[phys_addr]++;
	return 1;
}
|
初始化函数
第四类是 main.c 中调用的 mem_init 初始化函数:
1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
/*
 * mem_init - initialise the mem_map page-usage table.
 *
 * start_mem: first address of the region to hand out as free pages.
 * end_mem:   end address of that region; also stored in HIGH_MEMORY.
 *
 * Every one of the PAGING_PAGES entries is first marked USED, then the
 * entries covering [start_mem, end_mem) are reset to 0.
 */
void mem_init(long start_mem, long end_mem)
{
	int i;
	long pages;

	HIGH_MEMORY = end_mem;

	for (i = 0; i < PAGING_PAGES; i++)
		mem_map[i] = USED;

	i = MAP_NR(start_mem);
	pages = (end_mem - start_mem) >> 12;
	for (; pages > 0; pages--)
		mem_map[i++] = 0;
}
|
其他
最后是一些杂项:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
|
/*
 * write_verify - make sure the page at linear `address` can be written.
 *
 * Does nothing when the page-directory entry is not present.  When the
 * page-table entry has bit 0 set and bit 1 clear (low two bits == 1), it is
 * passed to un_wp_page().
 */
void write_verify(unsigned long address)
{
	unsigned long dir_entry;
	unsigned long *pte;

	dir_entry = *((unsigned long *) ((address>>20) & 0xffc));
	if (!(dir_entry & 1))
		return;

	pte = (unsigned long *) ((dir_entry & 0xfffff000) + ((address>>10) & 0xffc));
	if ((3 & *pte) == 1)
		un_wp_page(pte);
}
/*
 * calc_mem - print memory usage statistics.
 *
 * Prints how many mem_map entries are zero out of PAGING_PAGES.  Then, for
 * every present page-directory entry from index 2 upward, prints how many
 * entries of its page table have bit 0 set.
 */
void calc_mem(void)
{
	int page, dir_idx, slot, present;
	int free_pages = 0;
	long *table;

	for (page = 0; page < PAGING_PAGES; page++) {
		if (mem_map[page] == 0)
			free_pages++;
	}
	printk("%d pages free (of %d)\n\r",free_pages,PAGING_PAGES);

	for (dir_idx = 2; dir_idx < 1024; dir_idx++) {
		if (!(1 & pg_dir[dir_idx]))
			continue;
		table = (long *) (0xfffff000 & pg_dir[dir_idx]);
		present = 0;
		for (slot = 0; slot < 1024; slot++) {
			if (table[slot] & 1)
				present++;
		}
		printk("Pg-dir[%d] uses %d pages\n",dir_idx,present);
	}
}
|