内核:5.9.0
流程图

generic_file_buffered_read一种调用路径(cat某个文件触发):
#0  ondemand_readahead (mapping=0xffff888005c61340, ra=0xffff888005927598, filp=0xffff888005927500, hit_readahead_marker=false, index=0, req_size=16) at mm/readahead.c:445
#1  0xffffffff812eeea1 in page_cache_sync_readahead (req_count=<optimized out>, index=<optimized out>, filp=<optimized out>, ra=<optimized out>, mapping=<optimized out>) at mm/readahead.c:585
#2  page_cache_sync_readahead (mapping=<optimized out>, ra=0xffff888005927598, filp=0xffff888005927500, index=<optimized out>, req_count=16) at mm/readahead.c:567
#3  0xffffffff812dcae7 in generic_file_buffered_read (iocb=0xffffc90000033cc0, iter=<optimized out>, written=0) at mm/filemap.c:2199
#4  0xffffffff812dd8ed in generic_file_read_iter (iocb=0xffffc90000033cc0, iter=0xffffc90000033c98) at mm/filemap.c:2507
#5  0xffffffff814c7fc9 in ext4_file_read_iter (to=<optimized out>, iocb=<optimized out>) at fs/ext4/file.c:131
#6  ext4_file_read_iter (iocb=0xffffc90000033cc0, to=0xffffc90000033c98) at fs/ext4/file.c:114
#7  0xffffffff81405c0f in call_read_iter (file=<optimized out>, iter=<optimized out>, kio=<optimized out>) at ./include/linux/fs.h:1876
#8  generic_file_splice_read (in=0xffff888005927500, ppos=0xffffc90000033da8, pipe=<optimized out>, len=<optimized out>, flags=<optimized out>) at fs/splice.c:312
#9  0xffffffff81407b51 in do_splice_to (in=0xffff888005927500, ppos=0xffffc90000033da8, pipe=0xffff8880058fb6c0, len=65536, flags=<optimized out>) at fs/splice.c:890
#10 0xffffffff81407cab in splice_direct_to_actor (in=<optimized out>, sd=0xffffc90000033e00, actor=<optimized out>) at fs/splice.c:970
#11 0xffffffff81408012 in do_splice_direct (in=<optimized out>, ppos=0xffffc90000033ea8, out=0xffff888005927400, opos=0xffffc90000033eb0, len=16777216, flags=<optimized out>) at fs/splice.c:1079
#12 0xffffffff813ae9b1 in do_sendfile (out_fd=<optimized out>, in_fd=<optimized out>, ppos=0x0 <fixed_percpu_data>, count=<optimized out>, max=<optimized out>) at fs/read_write.c:1548
#13 0xffffffff813af30b in __do_sys_sendfile64 (count=<optimized out>, offset=<optimized out>, in_fd=<optimized out>, out_fd=<optimized out>) at fs/read_write.c:1609
#14 __se_sys_sendfile64 (count=<optimized out>, offset=<optimized out>, in_fd=<optimized out>, out_fd=<optimized out>) at fs/read_write.c:1595
generic_file_buffered_read核心逻辑:
- 尝试从page cache(address_space数据结构)中查找,如果命中则直接返回。
- 如果未在page cache中,调用readpage从磁盘读取数据到page cache中。
- 按需预读(readahead,并不是所有场景下都会预读),提升IO性能。
/**
 * generic_file_buffered_read - generic file read routine
 * @iocb:	the iocb to read
 * @iter:	data destination
 * @written:	already copied
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * Note the kernel's own comment below admits the control flow here is
 * "really ugly"...
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 *
 * Return:
 * * total number of bytes copied, including those the were already @written
 * * negative error code if nothing was copied
 */
ssize_t generic_file_buffered_read(struct kiocb *iocb,
		struct iov_iter *iter, ssize_t written)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
    //Per-file readahead state; the readahead algorithm is analysed in a
    //separate article.
	struct file_ra_state *ra = &filp->f_ra;
    //Current read position, i.e. the byte offset within the file.
	loff_t *ppos = &iocb->ki_pos;
	pgoff_t index;
	pgoff_t last_index;
	pgoff_t prev_index;
	unsigned long offset;      /* offset into pagecache page */
	unsigned int prev_offset;
	int error = 0;
	if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
		return 0;
	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
    //The file is divided into 4K pages; index is the page-cache page index:
    //|----index 0----|----index 1----|----index 2----|
    //       4K             4K              4K
	index = *ppos >> PAGE_SHIFT;
    //Page index and in-page offset of the previous read position.
	prev_index = ra->prev_pos >> PAGE_SHIFT;
	prev_offset = ra->prev_pos & (PAGE_SIZE-1);
    //iter->count is the number of bytes to read; last_index is the index
    //one past the last page this read touches (rounded up).
	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
	offset = *ppos & ~PAGE_MASK;
    //Loop until all iter->count bytes requested by the application have
    //been copied out (or an error/EOF stops us).
	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;
		cond_resched();
find_page:
		if (fatal_signal_pending(current)) {
			error = -EINTR;
			goto out;
		}
        //Look up the address_space (page cache) pointed to by mapping for
        //an already-cached page at this index.
		page = find_get_page(mapping, index);
		if (!page) {
            //IOCB_NOIO means no actual disk I/O may be issued (data must
            //already be in the cache), so report would_block instead.
			if (iocb->ki_flags & IOCB_NOIO)
				goto would_block;
            //"Synchronous" readahead. Note it is not truly synchronous:
            //it essentially just submits an I/O request to the block
            //layer and returns.
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);
            //The lookup may still fail: sync readahead allocates pages
            //without entering the allocation slow path, so it can fail.
            //On failure, goto no_cached_page: allocate a page ourselves
            //and issue readpage to do the disk I/O.
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
        //Reaching here means readahead succeeded, i.e. the page we want
        //was brought in by readahead. See the readahead machinery for how
        //the PageReadahead marker gets set on a page.
		if (PageReadahead(page)) {
			if (iocb->ki_flags & IOCB_NOIO) {
				put_page(page);
				goto out;
			}
            //A readahead-marked page was hit, so trigger another async
            //readahead to fetch more pages ahead; this also updates the
            //ra state.
			page_cache_async_readahead(mapping,
					ra, filp, page,
					index, last_index - index);
		}
        //The page data may not be valid yet, e.g. the system is very busy
        //with I/O and the async read above has not completed; then we
        //must wait here.
		if (!PageUptodate(page)) {
			/*
			 * See comment in do_read_cache_page on why
			 * wait_on_page_locked is used to avoid unnecessarily
			 * serialisations and why it's safe.
			 */
            //IOCB_WAITQ indicates async I/O, e.g. io_uring.
			if (iocb->ki_flags & IOCB_WAITQ) {
				if (written) {
					put_page(page);
					goto out;
				}
				error = wait_on_page_locked_async(page,
								iocb->ki_waitq);
			} else {
                //Caller asked not to wait: return an error instead.
				if (iocb->ki_flags & IOCB_NOWAIT) {
					put_page(page);
					goto would_block;
				}
                //Block waiting for the I/O to complete; a process shown
                //as blocked on I/O in systrace is waiting right here.
				error = wait_on_page_locked_killable(page);
			}
			if (unlikely(error))
				goto readpage_error;
            //The wait above succeeded.
			if (PageUptodate(page))
				goto page_ok;
			if (inode->i_blkbits == PAGE_SHIFT ||
					!mapping->a_ops->is_partially_uptodate)
				goto page_not_up_to_date;
			/* pipes can't handle partially uptodate pages */
			if (unlikely(iov_iter_is_pipe(iter)))
				goto page_not_up_to_date;
			if (!trylock_page(page))
				goto page_not_up_to_date;
			/* Did it get truncated before we got the lock? */
			if (!page->mapping)
				goto page_not_up_to_date_locked;
			if (!mapping->a_ops->is_partially_uptodate(page,
							offset, iter->count))
				goto page_not_up_to_date_locked;
			unlock_page(page);
		}
page_ok:
		/*
		 * i_size must be checked after we know the page is Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */
		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_SHIFT;
		if (unlikely(!isize || index > end_index)) {
			put_page(page);
			goto out;
		}
		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_SIZE;
		if (index == end_index) {
			nr = ((isize - 1) & ~PAGE_MASK) + 1;
			if (nr <= offset) {
				put_page(page);
				goto out;
			}
		}
		nr = nr - offset;
		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);
		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);
		prev_index = index;
		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */
        //Copy the data to user space; iter->count is decremented by the
        //number of bytes copied.
		ret = copy_page_to_iter(page, offset, nr, iter);
		offset += ret;
		index += offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;
		prev_offset = offset;
		put_page(page);
		written += ret;
        //If iter->count == 0, the requested data has all been read:
        //goto out to leave the loop.
		if (!iov_iter_count(iter))
			goto out;
		if (ret < nr) {
			error = -EFAULT;
			goto out;
		}
		continue;
page_not_up_to_date:
		/* Get exclusive access to the page ... */
		if (iocb->ki_flags & IOCB_WAITQ)
			error = lock_page_async(page, iocb->ki_waitq);
		else
			error = lock_page_killable(page);
		if (unlikely(error))
			goto readpage_error;
page_not_up_to_date_locked:
		/* Did it get truncated before we got the lock? */
		if (!page->mapping) {
			unlock_page(page);
			put_page(page);
			continue;
		}
		/* Did somebody else fill it already? */
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}
readpage:
        //Low-level page read: call ->readpage to trigger the actual
        //disk I/O.
		if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
			unlock_page(page);
			put_page(page);
			goto would_block;
		}
		/*
		 * A previous I/O error may have been due to temporary
		 * failures, eg. multipath errors.
		 * PG_error will be set again if readpage fails.
		 */
		ClearPageError(page);
		/* Start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);
		if (unlikely(error)) {
			if (error == AOP_TRUNCATED_PAGE) {
				put_page(page);
				error = 0;
				goto find_page;
			}
			goto readpage_error;
		}
		if (!PageUptodate(page)) {
			if (iocb->ki_flags & IOCB_WAITQ)
				error = lock_page_async(page, iocb->ki_waitq);
			else
				error = lock_page_killable(page);
			if (unlikely(error))
				goto readpage_error;
			if (!PageUptodate(page)) {
				if (page->mapping == NULL) {
					/*
					 * invalidate_mapping_pages got it
					 */
					unlock_page(page);
					put_page(page);
					goto find_page;
				}
				unlock_page(page);
				shrink_readahead_size_eio(ra);
				error = -EIO;
				goto readpage_error;
			}
			unlock_page(page);
		}
		goto page_ok;
readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		put_page(page);
		goto out;
no_cached_page:
        //The address_space has no cached page here: allocate a page and
        //insert it into the page cache.
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		page = page_cache_alloc(mapping);
		if (!page) {
			error = -ENOMEM;
			goto out;
		}
		error = add_to_page_cache_lru(page, mapping, index,
				mapping_gfp_constraint(mapping, GFP_KERNEL));
		if (error) {
			put_page(page);
			if (error == -EEXIST) {
				error = 0;
				goto find_page;
			}
			goto out;
		}
		goto readpage;
	}
would_block:
	error = -EAGAIN;
out:
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_SHIFT;
	ra->prev_pos |= prev_offset;
	*ppos = ((loff_t)index << PAGE_SHIFT) + offset;
	file_accessed(filp);
	return written ? written : error;
}
重点提示:代码中为什么先触发一次sync_readahead,再触发一次async_readahead?
- 如代码注释所说,这两个函数的命名很容易让人误解,其实从应用调用的角度来看,这两个都是"异步"的,因为它们都只是向block layer提交io请求。从代码上下文也能推测出来:比如sync_readahead调用完之后,紧接着调用了find_get_page并判断了page == NULL的情况,从这点就能充分看出sync_readahead也是异步调用。
- 如果系统io情况正常,100%顺序读取的情况下,sync_readahead调用一次,触发了预读,后面再顺序读就不会触发sync_readahead了,而是触发async_readahead。async_readahead触发的条件是要读取的page满足PageReadahead(page),即预读页面被命中了。就像网络协议栈中的滑动窗口:既然请求已经来到预读的窗口中,那么就要重新触发新的预读,预读更多的页面。
具体上面这段话不理解的话,参考我后面readahead算法介绍的文章。
参考文章:
Linux 通用块层 bio 详解 – 字节岛技术分享
Life of an ext4 write request - Ext4
深入分析Linux内核File cache机制(上篇) - 知乎
Linux内核中跟踪文件PageCache预读_读取



















