开始 fs 模块之前,我发现如果对块设备/字符设备的驱动程序不了解的话,读 fs 代码时会困难重重。为了简化问题,本文及之后的 fs 模块都将只记录关于块设备(特指硬盘)的代码,先弄懂一个,剩下的读起来就轻松了。阅读本文或许会一头雾水,但和下篇文章联系起来看就会清楚许多了(x
块设备操作方式(以读数据为例)
提到 I/O 先来看一张图:
当程序需要从硬盘中读取数据(read 系统调用)时,缓冲区管理程序会先查询该数据块是否已经读入到缓冲区中。如果是,则直接将该缓冲头(涉及高速缓冲的管理方式,下篇文章将会记录)返回并唤醒等待此数据块的进程;否则调用 ll_rw_block 函数,告诉块设备驱动程序(内核代码)现在需要读数据块,该函数就会为其创建一个请求项,并挂入相应设备的请求队列,同时发出请求的进程会被挂起(不可中断睡眠态)。
当请求被处理时,设备控制器根据请求项中的参数,向硬盘驱动器发送读指令,硬盘驱动器就会将数据读取到设备控制器的缓冲区中(注意此时原发出读盘请求的进程已被挂起,CPU 正在被其他进程占用)。当设备控制器检测到数据读取完毕,就会产生一个中断请求信号发往 CPU,CPU 在硬盘中断处理程序 hd_interrupt 中调用 read_intr 函数将数据从设备控制器的缓冲区搬到内存的高速缓冲区中,并让设备控制器开始处理下一个请求(如果有的话)。最后内核将高速缓冲中的数据拷贝到调用 read 函数时第二个参数指向的地址中去。用一张图来总结:
请求项与请求队列
请求项
请求项的数据结构如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
|
/*
 * One queued block-device transfer. Entries live in the fixed request[]
 * table below and are chained per device through 'next'.
 */
struct request {
	int dev;			/* device number; end_request() stores -1 here once the request is finished */
	int cmd;			/* operation; do_hd_request() compares it with READ and WRITE */
	int errors;			/* error count, bumped by bad_rw_intr() and cleared by read_intr() */
	unsigned long sector;		/* next sector to transfer; advanced once per sector */
	unsigned long nr_sectors;	/* sectors still to transfer; counted down once per sector */
	char * buffer;			/* data pointer; advanced by 512 per sector */
	struct task_struct * waiting;	/* woken by end_request() when the request ends */
	struct buffer_head * bh;	/* buffer head to mark up to date and unlock; end_request() tests it for NULL */
	struct request * next;		/* next request in the same device queue */
};
/* Fixed pool of NR_REQUEST request slots shared by all block devices. */
struct request request[NR_REQUEST];
|
为什么请求项已经可以通过 next 指针构成单向链表了,还需要一个数组来维护呢?采用数组加链表结构其实是为了满足两个目的:
- 数组结构使得搜索空闲请求项时可以直接循环遍历整个数组;由于数组长度固定(NR_REQUEST = 32),搜索开销有常数上界
- 链表结构是为了满足电梯算法插入请求项的操作
请求队列
对于各种块设备,内核使用块设备表 blk_dev 来管理,每种块设备在块设备表中占有一项,相关数据结构如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
|
/*
 * Per-device-type descriptor: one entry of the blk_dev[] table.
 */
struct blk_dev_struct {
	void (*request_fn)(void);		/* request handler; hd_init() installs DEVICE_REQUEST here */
	struct request * current_request;	/* head of this device's request list (what CURRENT expands to) */
};
/*
 * Block device table, indexed by major device number. Every slot starts
 * out empty; slot 3 is the hard disk (MAJOR_NR == 3 in hd.c).
 */
struct blk_dev_struct blk_dev[NR_BLK_DEV] = {
	{ NULL, NULL },
	{ NULL, NULL },
	{ NULL, NULL },
	{ NULL, NULL },
	{ NULL, NULL },
	{ NULL, NULL },
	{ NULL, NULL }
};
|
再来通过一张图直观地感受这些数据结构之间的关系:
通过之前的描述不难看出,硬盘设备有 4 个请求,软盘设备有 1 个请求,虚拟盘暂无请求。下面正式开始块设备(仅硬盘)驱动程序部分源码的阅读
blk.h
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
| #ifndef _BLK_H #define _BLK_H
#define NR_BLK_DEV 7 #define NR_REQUEST 32
/*
 * One queued block-device transfer. Entries live in the request[] table
 * and are chained per device through 'next'.
 */
struct request {
	int dev;			/* device number; end_request() stores -1 here once the request is finished */
	int cmd;			/* operation; do_hd_request() compares it with READ and WRITE */
	int errors;			/* error count, bumped by bad_rw_intr() and cleared by read_intr() */
	unsigned long sector;		/* next sector to transfer; advanced once per sector */
	unsigned long nr_sectors;	/* sectors still to transfer; counted down once per sector */
	char * buffer;			/* data pointer; advanced by 512 per sector */
	struct task_struct * waiting;	/* woken by end_request() when the request ends */
	struct buffer_head * bh;	/* buffer head to mark up to date and unlock; end_request() tests it for NULL */
	struct request * next;		/* next request in the same device queue */
};
/*
 * IN_ORDER(s1,s2): nonzero when request *s1 sorts strictly before *s2.
 *
 * The ordering is lexicographic on (cmd, dev, sector): a smaller cmd wins,
 * then a smaller dev, then a smaller sector. Equal requests are not in order.
 * NOTE(review): if READ < WRITE this places reads ahead of writes - confirm
 * against the definitions of those constants.
 *
 * Every && / || group is parenthesized explicitly so the grouping does not
 * rely on operator precedence.
 */
#define IN_ORDER(s1,s2) \
	(((s1)->cmd < (s2)->cmd) || \
	 (((s1)->cmd == (s2)->cmd) && \
	  (((s1)->dev < (s2)->dev) || \
	   (((s1)->dev == (s2)->dev) && \
	    ((s1)->sector < (s2)->sector)))))
/*
 * Per-device-type descriptor: one entry of the blk_dev[] table.
 */
struct blk_dev_struct {
	void (*request_fn)(void);		/* request handler; hd_init() installs DEVICE_REQUEST here */
	struct request * current_request;	/* head of this device's request list (what CURRENT expands to) */
};
extern struct blk_dev_struct blk_dev[NR_BLK_DEV]; extern struct request request[NR_REQUEST]; extern struct task_struct * wait_for_request;
#ifdef MAJOR_NR ... #elif (MAJOR_NR == 3) #define DEVICE_NAME "harddisk" #define DEVICE_INTR do_hd #define DEVICE_REQUEST do_hd_request #define DEVICE_NR(device) (MINOR(device)/5) #define DEVICE_ON(device) #define DEVICE_OFF(device) #elif (MAJOR_NR > 3) #error "unknown blk device" #endif
/* Head of the request list of the device selected by MAJOR_NR. */
#define CURRENT (blk_dev[MAJOR_NR].current_request)
/* Drive index (via DEVICE_NR) of the request at the head of the list. */
#define CURRENT_DEV DEVICE_NR(CURRENT->dev)
#ifdef DEVICE_INTR
/* Handler pointer for the device (do_hd for the hard disk); hd_out() stores the next handler in it. */
void (*DEVICE_INTR)(void) = NULL;
#endif
/* Forward declaration of the device's request function. */
static void (DEVICE_REQUEST)(void);
/*
 * unlock_buffer() - release the lock on a buffer head and wake its waiters.
 * @bh: buffer head to unlock.
 *
 * Unlocking a buffer that is not locked is only reported via printk; the
 * lock flag is cleared and b_wait is woken in either case.
 */
static inline void unlock_buffer(struct buffer_head * bh)
{
	if (bh->b_lock == 0) {
		printk(DEVICE_NAME ": free buffer being unlocked\n");
	}
	bh->b_lock = 0;
	wake_up(&bh->b_wait);
}
static inline void end_request(int uptodate) { DEVICE_OFF(CURRENT->dev); if (CURRENT->bh) { CURRENT->bh->b_uptodate = uptodate; unlock_buffer(CURRENT->bh); } if (!uptodate) { printk(DEVICE_NAME " I/O error\n\r"); printk("dev %04x, block %d\n\r",CURRENT->dev, CURRENT->bh->b_blocknr); } wake_up(&CURRENT->waiting); wake_up(&wait_for_request); CURRENT->dev = -1; CURRENT = CURRENT->next; }
/*
 * INIT_REQUEST: prologue for a device request function.
 *
 * Defines the label 'repeat' (so the enclosing function can 'goto repeat'
 * after finishing a request), returns from the enclosing function when the
 * queue is empty, and panics if the head request belongs to another major
 * device or if its buffer head, when present, is not locked.
 */
#define INIT_REQUEST \
repeat: \
	if (!CURRENT) \
		return; \
	if (MAJOR(CURRENT->dev) != MAJOR_NR) \
		panic(DEVICE_NAME ": request list destroyed"); \
	if (CURRENT->bh) { \
		if (!CURRENT->bh->b_lock) \
			panic(DEVICE_NAME ": block not locked"); \
	}
#endif #endif
|
hd.c
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
| #include <linux/config.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/hdreg.h> #include <asm/system.h> #include <asm/io.h> #include <asm/segment.h>
#define MAJOR_NR 3 #include "blk.h"
/*
 * CMOS_READ(addr): write (0x80 | addr) to port 0x70, then evaluate to the
 * byte read back from port 0x71.
 *
 * 'addr' is parenthesized so that an argument containing a lower-precedence
 * operator (for example a conditional expression) is OR-ed as a whole.
 * NOTE(review): bit 7 of the index byte presumably disables NMI on the PC -
 * confirm against the hardware documentation.
 */
#define CMOS_READ(addr) ({ \
	outb_p(0x80|(addr),0x70); \
	inb_p(0x71); \
})
/* bad_rw_intr() gives up on a request after this many errors; more than half of it requests a reset. */
#define MAX_ERRORS 7
/* Number of drives the hd[] table is sized for. */
#define MAX_HD 2

static void recal_intr(void);

/* Flags consumed by do_hd_request(): issue a recalibrate / a controller reset before the next transfer. */
static int recalibrate = 0;
static int reset = 0;

/*
 * Per-drive parameters: heads, sectors per track, cylinders, plus wpcom,
 * lzone and ctl as read from the BIOS table in sys_setup().
 * NOTE(review): wpcom / lzone are presumably the write-precompensation
 * cylinder and the landing zone - confirm. ctl is the byte hd_out() writes
 * to HD_CMD.
 */
struct hd_i_struct {
	int head,sect,cyl,wpcom,lzone,ctl;
};
#ifdef HD_TYPE
/* Drive table fixed at compile time; NR_HD is then a constant expression. */
struct hd_i_struct hd_info[] = { HD_TYPE };
#define NR_HD ((sizeof (hd_info))/(sizeof (struct hd_i_struct)))
#else
/* Drive table and drive count filled in at run time by sys_setup(). */
struct hd_i_struct hd_info[] = { {0,0,0,0,0,0},{0,0,0,0,0,0} };
static int NR_HD = 0;
#endif

/*
 * Start sector and length of each addressable unit, five entries per drive:
 * index 5*drive is the whole disk, 5*drive+1..4 are the four partition
 * table entries (both filled in by sys_setup()).
 */
static struct hd_struct {
	long start_sect;
	long nr_sects;
} hd[5*MAX_HD]={{0,0},};
/*
 * port_read(port, buf, nr): read 'nr' 16-bit words from I/O port 'port'
 * into memory at 'buf' using "rep insw".
 *
 * The instruction advances the destination pointer and counts the counter
 * register down to zero, and it writes memory. GCC requires all of that to
 * be declared: the pointer and count are therefore read-write ("+")
 * operands held in private temporaries, and "memory" is listed as
 * clobbered. Use as a statement.
 */
#define port_read(port,buf,nr) \
do { \
	void *port_read_dst_ = (buf); \
	unsigned long port_read_cnt_ = (nr); \
	__asm__ __volatile__("cld;rep;insw" \
		: "+D" (port_read_dst_), "+c" (port_read_cnt_) \
		: "d" (port) \
		: "memory"); \
} while (0)
/*
 * port_write(port, buf, nr): write 'nr' 16-bit words from memory at 'buf'
 * to I/O port 'port' using "rep outsw".
 *
 * The instruction advances the source pointer and counts the counter
 * register down to zero, so both are declared as read-write ("+") operands
 * held in private temporaries. The "memory" clobber makes sure pending
 * stores to the buffer are emitted before the transfer. Use as a statement.
 */
#define port_write(port,buf,nr) \
do { \
	const void *port_write_src_ = (buf); \
	unsigned long port_write_cnt_ = (nr); \
	__asm__ __volatile__("cld;rep;outsw" \
		: "+S" (port_write_src_), "+c" (port_write_cnt_) \
		: "d" (port) \
		: "memory"); \
} while (0)
extern void hd_interrupt(void); extern void rd_load(void);
|
接下来是在 init 函数中调用的 setup 系统调用,参数 BIOS 是 setup.s 程序取得并放置在 0x90080 处的包含两个硬盘参数的硬盘参数表指针(大小为 32 字节)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
|
/*
 * sys_setup() - one-time hard disk setup: drive geometry, partition
 * tables, then RAM disk load and root mount.
 * @BIOS: pointer to two consecutive 16-byte drive parameter entries.
 *
 * Returns 0 on success, or -1 if it has already been called once.
 * Panics if a partition table cannot be read or lacks the 0x55 0xAA
 * signature.
 *
 * NOTE(review): when HD_TYPE is defined, NR_HD is a macro expanding to a
 * sizeof expression, so the "NR_HD = ..." assignments after the CMOS read
 * would not compile - confirm how that configuration is meant to build.
 */
int sys_setup(void * BIOS)
{
	static int callable = 1;
	int i,drive;
	unsigned char cmos_disks;
	struct partition *p;
	struct buffer_head * bh;

	/* only the first call does any work */
	if (!callable)
		return -1;
	callable = 0;
#ifndef HD_TYPE
	/* no compile-time table: copy geometry out of the two BIOS entries */
	for (drive=0 ; drive<2 ; drive++) {
		hd_info[drive].cyl = *(unsigned short *) BIOS;
		hd_info[drive].head = *(unsigned char *) (2+BIOS);
		hd_info[drive].wpcom = *(unsigned short *) (5+BIOS);
		hd_info[drive].ctl = *(unsigned char *) (8+BIOS);
		hd_info[drive].lzone = *(unsigned short *) (12+BIOS);
		hd_info[drive].sect = *(unsigned char *) (14+BIOS);
		BIOS += 16;
	}
	/* a second drive counts as present when its cylinder count is nonzero */
	if (hd_info[1].cyl)
		NR_HD=2;
	else
		NR_HD=1;
#endif
	/* entry 5*i describes the whole drive i: heads * sectors * cylinders */
	for (i=0 ; i<NR_HD ; i++) {
		hd[i*5].start_sect = 0;
		hd[i*5].nr_sects = hd_info[i].head*
				hd_info[i].sect*hd_info[i].cyl;
	}

	/*
	 * Re-derive the drive count from CMOS byte 0x12: a zero high nibble
	 * means no drives, otherwise the low nibble decides between one and two.
	 */
	if ((cmos_disks = CMOS_READ(0x12)) & 0xf0)
		if (cmos_disks & 0x0f)
			NR_HD = 2;
		else
			NR_HD = 1;
	else
		NR_HD = 0;
	/* clear the whole-disk entries of drives that are not present */
	for (i = NR_HD ; i < 2 ; i++) {
		hd[i*5].start_sect = 0;
		hd[i*5].nr_sects = 0;
	}
	/* read block 0 of each whole-disk device (0x300 + 5*drive) and copy its partition table */
	for (drive=0 ; drive<NR_HD ; drive++) {
		if (!(bh = bread(0x300 + drive*5,0))) {
			printk("Unable to read partition table of drive %d\n\r",
				drive);
			panic("");
		}
		/* the sector must end with the bytes 0x55 0xAA */
		if (bh->b_data[510] != 0x55 || (unsigned char)
		    bh->b_data[511] != 0xAA) {
			printk("Bad partition table on drive %d\n\r",drive);
			panic("");
		}
		/* four partition entries start at byte offset 0x1BE */
		p = 0x1BE + (void *)bh->b_data;
		for (i=1;i<5;i++,p++) {
			hd[i+5*drive].start_sect = p->start_sect;
			hd[i+5*drive].nr_sects = p->nr_sects;
		}
		brelse(bh);
	}
	if (NR_HD)
		printk("Partition table%s ok.\n\r",(NR_HD>1)?"s":"");
	rd_load();
	mount_root();
	return (0);
}
|
然后是 main 函数中调用的 hd_init 函数
1 2 3 4 5 6 7 8 9
|
/*
 * hd_init() - hook the hard disk driver into the kernel.
 *
 * Installs the request function in the block device table, points
 * interrupt gate 0x2E at hd_interrupt, and clears one bit in each of the
 * bytes at ports 0x21 and 0xA1.
 * NOTE(review): those two writes presumably unmask the cascade line on the
 * master interrupt controller and the disk IRQ on the slave - confirm.
 */
void hd_init(void)
{
	blk_dev[MAJOR_NR].request_fn = DEVICE_REQUEST;
	set_intr_gate(0x2E,&hd_interrupt);
	outb_p(inb_p(0x21)&0xfb,0x21);	/* clear bit 2 at port 0x21 */
	outb(inb_p(0xA1)&0xbf,0xA1);	/* clear bit 6 at port 0xA1 */
}
|
向硬盘控制器发送命令的函数 hd_out,参数 drive 是驱动器号,nsect 是读写扇区数,sect 是起始扇区号,head 是磁头号,cyl 是柱面号,cmd 是命令码,intr_addr 类型为函数指针,调用时此处需要传一个函数名,该函数将在硬盘中断处理程序中被调用(类似 hook)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
|
/*
 * hd_out() - program the controller registers and issue a command.
 * @drive:     drive number (0 or 1).
 * @nsect:     number of sectors to transfer.
 * @sect:      starting sector number.
 * @head:      head number (0..15).
 * @cyl:       cylinder number.
 * @cmd:       command code written last.
 * @intr_addr: function stored in do_hd as the handler for the next disk
 *             interrupt.
 *
 * Panics if drive or head is out of range or if the controller does not
 * become ready.
 */
static void hd_out(unsigned int drive,unsigned int nsect,unsigned int sect,
		unsigned int head,unsigned int cyl,unsigned int cmd,
		void (*intr_addr)(void))
{
	/* port number is kept in dx and stepped through consecutive registers */
	register int port asm("dx");

	if (drive>1 || head>15)
		panic("Trying to write bad sector");
	if (!controller_ready())
		panic("HD controller not ready");
	do_hd = intr_addr;
	outb_p(hd_info[drive].ctl,HD_CMD);		/* control byte first */
	port=HD_DATA;
	outb_p(hd_info[drive].wpcom>>2,++port);		/* wpcom / 4 */
	outb_p(nsect,++port);				/* sector count */
	outb_p(sect,++port);				/* starting sector */
	outb_p(cyl,++port);				/* cylinder, low byte */
	outb_p(cyl>>8,++port);				/* cylinder, high byte */
	outb_p(0xA0|(drive<<4)|head,++port);		/* drive and head select */
	outb(cmd,++port);				/* command register */
}
|
处理硬盘当前请求项的函数 do_hd_request
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
|
/*
 * do_hd_request() - start work on the request at the head of the hard disk
 * queue.
 *
 * Validates the request, converts its linear sector number into
 * cylinder/head/sector, then either performs a pending reset or
 * recalibrate, or issues the read/write command. Completion is handled by
 * the interrupt handler passed to hd_out(). Returns immediately when the
 * queue is empty (see INIT_REQUEST, which also defines the 'repeat' label).
 */
void do_hd_request(void)
{
	int i,r;
	unsigned int block,dev;
	unsigned int sec,head,cyl;
	unsigned int nsect;

	INIT_REQUEST;
	dev = MINOR(CURRENT->dev);
	block = CURRENT->sector;
	/* reject unknown minors and requests that do not leave two sectors before the end */
	if (dev >= 5*NR_HD || block+2 > hd[dev].nr_sects) {
		end_request(0);
		goto repeat;
	}
	block += hd[dev].start_sect;	/* partition-relative -> absolute sector */
	dev /= 5;			/* minor -> drive index */
	/* block / sectors-per-track: quotient back into block, remainder is the sector */
	__asm__("divl %4":"=a" (block),"=d" (sec):"0" (block),"1" (0),
		"r" (hd_info[dev].sect));
	/* quotient / heads: quotient is the cylinder, remainder is the head */
	__asm__("divl %4":"=a" (cyl),"=d" (head):"0" (block),"1" (0),
		"r" (hd_info[dev].head));
	sec++;				/* the controller is handed a 1-based sector number */
	nsect = CURRENT->nr_sectors;
	if (reset) {
		/* a reset is always followed by a recalibrate on the next pass */
		reset = 0;
		recalibrate = 1;
		reset_hd(CURRENT_DEV);
		return;
	}
	if (recalibrate) {
		recalibrate = 0;
		hd_out(dev,hd_info[CURRENT_DEV].sect,0,0,0,
			WIN_RESTORE,&recal_intr);
		return;
	}
	if (CURRENT->cmd == WRITE) {
		hd_out(dev,nsect,sec,head,cyl,WIN_WRITE,&write_intr);
		/* poll (bounded) for DRQ_STAT before sending the first sector */
		for(i=0 ; i<3000 && !(r=inb_p(HD_STATUS)&DRQ_STAT) ; i++)
			;
		if (!r) {
			bad_rw_intr();
			goto repeat;
		}
		port_write(HD_DATA,CURRENT->buffer,256);	/* 256 words = one 512-byte sector */
	} else if (CURRENT->cmd == READ) {
		hd_out(dev,nsect,sec,head,cyl,WIN_READ,&read_intr);
	} else
		panic("unknown hd-command");
}
|
另一类函数是硬盘中断处理过程中可被调用的函数,它们有 read_intr、write_intr、bad_rw_intr、recal_intr
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
|
static void read_intr(void) { if (win_result()) { bad_rw_intr(); do_hd_request(); return; } port_read(HD_DATA,CURRENT->buffer,256); CURRENT->errors = 0; CURRENT->buffer += 512; CURRENT->sector++; if (--CURRENT->nr_sectors) { do_hd = &read_intr; return; } end_request(1); do_hd_request(); }
static void write_intr(void) { if (win_result()) { bad_rw_intr(); do_hd_request(); return; } if (--CURRENT->nr_sectors) { CURRENT->sector++; CURRENT->buffer += 512; do_hd = &write_intr; port_write(HD_DATA,CURRENT->buffer,256); return; } end_request(1); do_hd_request(); }
/*
 * bad_rw_intr() - account for a failed transfer on the current request.
 *
 * Increments the request's error count. Once it reaches MAX_ERRORS the
 * request is abandoned with end_request(0); otherwise, once it exceeds
 * MAX_ERRORS/2, a controller reset is requested for the next pass through
 * do_hd_request().
 *
 * The two cases are mutually exclusive on purpose: end_request() advances
 * CURRENT to the next request, which may be NULL, so CURRENT must not be
 * dereferenced again after the request has been abandoned.
 */
static void bad_rw_intr(void)
{
	if (++CURRENT->errors >= MAX_ERRORS)
		end_request(0);
	else if (CURRENT->errors > MAX_ERRORS/2)
		reset = 1;
}
/*
 * recal_intr() - disk interrupt handler installed for a recalibrate
 * command. Records an error if the controller status is bad, then resumes
 * request processing in either case.
 */
static void recal_intr(void)
{
	int failed = win_result();

	if (failed) {
		bad_rw_intr();
	}
	do_hd_request();
}
|
剩下的函数为操作硬盘控制器的辅助函数
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
|
/*
 * controller_ready() - wait, with a bounded number of polls, for bit 0x80
 * of the status register to clear.
 *
 * Returns the remaining poll budget: nonzero as soon as the bit is seen
 * clear, 0 if it stayed set for all 99999 polls.
 */
static int controller_ready(void)
{
	int budget;

	for (budget = 100000 - 1; budget != 0; budget--) {
		if (!(inb_p(HD_STATUS) & 0x80)) {
			break;
		}
	}
	return budget;
}
/*
 * win_result() - check the controller status after a command.
 *
 * Returns 0 when, among the busy/ready/write-error/seek/error status bits,
 * exactly READY_STAT and SEEK_STAT are set; returns 1 otherwise. In the
 * failure case the error register is also read when bit 0 of the status is
 * set (the value read is not used).
 */
static int win_result(void)
{
	const int watched = BUSY_STAT | READY_STAT | WRERR_STAT
			  | SEEK_STAT | ERR_STAT;
	const int healthy = READY_STAT | SEEK_STAT;
	int status = inb_p(HD_STATUS);

	if ((status & watched) == healthy) {
		return 0;
	}
	if (status & 1) {
		status = inb(HD_ERROR);
	}
	return 1;
}
/*
 * drive_busy() - wait for the drive to leave the busy state.
 *
 * Polls the status register up to 10000 times until READY_STAT is set with
 * BUSY_STAT clear, then re-reads the status once. Returns 0 when, among the
 * busy/ready/seek bits, exactly READY_STAT and SEEK_STAT are set; otherwise
 * logs a timeout and returns 1.
 */
static int drive_busy(void)
{
	unsigned int i;

	for (i = 0; i < 10000; i++) {
		if (READY_STAT == (inb_p(HD_STATUS) & (BUSY_STAT|READY_STAT)))
			break;
	}
	i = inb(HD_STATUS);
	i &= BUSY_STAT | READY_STAT | SEEK_STAT;
	/*
	 * '==' binds tighter than '|', so the mask must be parenthesized;
	 * without the parentheses the test is nonzero whenever SEEK_STAT is
	 * nonzero and the timeout below is never reported.
	 */
	if (i == (READY_STAT | SEEK_STAT))
		return(0);
	printk("HD controller times out\n\r");
	return(1);
}
/*
 * reset_controller() - reset the disk controller.
 *
 * Writes 4 to the control port, spins for 100 nop() calls, then writes
 * drive 0's control byte (low four bits only). Logs a message if the drive
 * still reports busy afterwards, or if the error register does not read 1.
 */
static void reset_controller(void)
{
	int i;

	outb(4,HD_CMD);				/* NOTE(review): presumably the controller reset bit - confirm */
	for(i = 0; i < 100; i++)
		nop();				/* short delay before resuming normal operation */
	outb(hd_info[0].ctl & 0x0f ,HD_CMD);
	if (drive_busy())
		printk("HD-controller still busy\n\r");
	if ((i = inb(HD_ERROR)) != 1)
		printk("HD-controller reset failed: %02x\n\r",i);
}
|