The meaning of the buffers and cached values reported by the Linux free command, and the relevant kernel source

2014-11-12 21:44:10 | Category: linux
I saw someone online bring this up, so here is my own rough look at the kernel source:



Running free under strace shows that it reads these two files:

/proc/stat
/proc/meminfo


A look at cat /proc/meminfo confirms that the cached and free figures should come from here.

cat /proc/vmstat carries similar information as well.
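free itself does little more than parse these files. A minimal sketch in C of that parsing (hypothetical code, not the actual procps implementation; field names as exported by /proc/meminfo):

#include <stdio.h>
#include <string.h>

/* A minimal sketch (hypothetical, not the real procps code) of the
 * parsing free has to do: pull the "Name:   value kB" lines apart. */
static unsigned long meminfo_kb(const char *field)
{
        char line[256];
        unsigned long val = 0;
        size_t len = strlen(field);
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f)
                return 0;
        while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, field, len) == 0 && line[len] == ':') {
                        sscanf(line + len + 1, "%lu", &val);
                        break;
                }
        }
        fclose(f);
        return val;                       /* value is reported in kB */
}

int main(void)
{
        printf("total: %lu kB  free: %lu kB  buffers: %lu kB  cached: %lu kB\n",
               meminfo_kb("MemTotal"), meminfo_kb("MemFree"),
               meminfo_kb("Buffers"), meminfo_kb("Cached"));
        return 0;
}

The interesting part is how the kernel produces those numbers, which is what the rest of this post digs into.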

http://lxr.free-electrons.com/source/fs/proc/meminfo.c#L220

 static const struct file_operations meminfo_proc_fops = {
         .open           = meminfo_proc_open,
         .read           = seq_read,
         .llseek         = seq_lseek,
         .release        = single_release,
 };


static int meminfo_proc_show(struct seq_file *m, void *v)
{
        struct sysinfo i;
        long cached;
        long available;
        ...
#define K(x) ((x) << (PAGE_SHIFT - 10))
        si_meminfo(&i);
        si_swapinfo(&i);
        ...
        cached = global_page_state(NR_FILE_PAGES) -
                        total_swapcache_pages() - i.bufferram;
        if (cached < 0)
                cached = 0;
        ...
        seq_printf(m,
                "MemTotal:       %8lu kB\n"
                "MemFree:        %8lu kB\n"
                "MemAvailable:   %8lu kB\n"
                "Buffers:        %8lu kB\n"
                "Cached:         %8lu kB\n"
                "SwapCached:     %8lu kB\n"
                ...
                K(i.totalram),
                K(i.freeram),
                K(available),
                K(i.bufferram),            /* the Buffers column */
                K(cached),                 /* the Cached column */
                K(total_swapcache_pages()),
                ...);


So the Cached figure is the global NR_FILE_PAGES count, minus the swap-cache pages and minus i.bufferram.

global_page_state returns the system-wide total of pages accounted under the given item, here NR_FILE_PAGES.

NR_FILE_PAGES is one of the values of enum zone_stat_item, defined in
http://lxr.free-electrons.com/source/include/linux/mmzone.h :

enum zone_stat_item {
        ...
        NR_FILE_PAGES,
        ...
};


Whenever a page's state changes, the kernel calls inc_zone_state / inc_zone_page_state (and related helpers) to update this NR_FILE_PAGES counter:

http://lxr.free-electrons.com/source/mm/swap_state.c
     int __add_to_swap_cache(struct page *page, swp_entry_t entry)
     void __delete_from_swap_cache(struct page *page)

http://lxr.free-electrons.com/source/mm/filemap.c
     int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                pgoff_t offset, gfp_t gfp_mask)

struct page *find_or_create_page(struct address_space *mapping,
                  pgoff_t index, gfp_t gfp_mask)
 /*
 * Find or create a page at the given pagecache position. Return the locked
 * page. This function is specifically for buffered writes.
 */

grab_cache_page_write_begin
   -> pagecache_get_page
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                                 pgoff_t offset, gfp_t gfp_mask)
void __delete_from_page_cache(struct page *page, void *shadow)



http://lxr.free-electrons.com/source/mm/vmscan.c
static long zone_pagecache_reclaimable(struct zone *zone)


All of these functions update the counter.

Tracing the read path through the kernel:

do_generic_file_read
  page_cache_read
    pagecache_get_page
      page_cache_alloc_cold
      add_to_page_cache_lru -> __add_to_page_cache_locked -> page_cache_tree_insert
          // updates struct address_space *mapping and the NR_FILE_PAGES counter

So the page cache produced by reading files is all accounted in this NR_FILE_PAGES statistic.



        cached = global_page_state(NR_FILE_PAGES) -
                        total_swapcache_pages() - i.bufferram;
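This formula can be sanity-checked from userspace: /proc/vmstat exports nr_file_pages (a page count), while /proc/meminfo exports SwapCached and Buffers (in kB). A small sketch, assuming those field names; the two files are read at different instants, so on a busy system the match is only approximate:

#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Sketch: cross-check the Cached formula from userspace.
 * /proc/vmstat gives nr_file_pages in pages; /proc/meminfo gives
 * SwapCached and Buffers in kB. */
static unsigned long get_field(const char *path, const char *prefix)
{
        char line[256];
        unsigned long val = 0;
        size_t len = strlen(prefix);
        FILE *f = fopen(path, "r");

        if (!f)
                return 0;
        while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, prefix, len) == 0) {
                        char *p = line + len;
                        while (*p && (*p < '0' || *p > '9'))
                                p++;                  /* skip to the number */
                        sscanf(p, "%lu", &val);
                        break;
                }
        }
        fclose(f);
        return val;
}

int main(void)
{
        unsigned long page_kb = sysconf(_SC_PAGESIZE) / 1024;
        unsigned long file_kb = get_field("/proc/vmstat", "nr_file_pages ") * page_kb;
        unsigned long swapcached = get_field("/proc/meminfo", "SwapCached:");
        unsigned long buffers    = get_field("/proc/meminfo", "Buffers:");

        printf("nr_file_pages - SwapCached - Buffers = %lu kB\n",
               file_kb - swapcached - buffers);
        printf("Cached from /proc/meminfo            = %lu kB\n",
               get_field("/proc/meminfo", "Cached:"));
        return 0;
}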


total_swapcache_pages() presumably removes the pages sitting in the swap cache. But what exactly is i.bufferram? Let's take a look.

http://lxr.free-electrons.com/source/include/uapi/linux/sysinfo.h

struct sysinfo {
        __kernel_long_t uptime;         /* Seconds since boot */
        __kernel_ulong_t loads[3];      /* 1, 5, and 15 minute load averages */
        __kernel_ulong_t totalram;      /* Total usable main memory size */
        __kernel_ulong_t freeram;       /* Available memory size */
        __kernel_ulong_t sharedram;     /* Amount of shared memory */
        __kernel_ulong_t bufferram;     /* Memory used by buffers */
        __kernel_ulong_t totalswap;     /* Total swap space size */
        __kernel_ulong_t freeswap;      /* swap space still available */
        __u16 procs;                    /* Number of current processes */
        __u16 pad;                      /* Explicit padding for m68k */
        __kernel_ulong_t totalhigh;     /* Total high memory size */
        __kernel_ulong_t freehigh;      /* Available high memory size */
        __u32 mem_unit;                 /* Memory unit size in bytes */
        char _f[20-2*sizeof(__kernel_ulong_t)-sizeof(__u32)];   /* Padding: libc5 uses this.. */
};


"Memory used by buffers" is not exactly self-explanatory. Back to the code.

http://lxr.free-electrons.com/source/mm/page_alloc.c
 void si_meminfo(struct sysinfo *val)
 {
         val->totalram = totalram_pages;
         val->sharedram = global_page_state(NR_SHMEM);
         val->freeram = global_page_state(NR_FREE_PAGES);
         val->bufferram = nr_blockdev_pages();
         val->totalhigh = totalhigh_pages;
         val->freehigh = nr_free_highpages();
         val->mem_unit = PAGE_SIZE;
 }
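As an aside, this same struct sysinfo is what the sysinfo(2) syscall fills in for userspace, so bufferram can be read directly without going through /proc at all; a quick check:

#include <stdio.h>
#include <sys/sysinfo.h>

/* bufferram straight from the sysinfo(2) syscall; all the ram fields
 * are in units of mem_unit bytes. */
int main(void)
{
        struct sysinfo si;

        if (sysinfo(&si) != 0) {
                perror("sysinfo");
                return 1;
        }
        printf("totalram:  %lu kB\n", si.totalram  * si.mem_unit / 1024);
        printf("freeram:   %lu kB\n", si.freeram   * si.mem_unit / 1024);
        printf("bufferram: %lu kB\n", si.bufferram * si.mem_unit / 1024);
        return 0;
}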


Now, what does nr_blockdev_pages do?

 long nr_blockdev_pages(void)
 {
         struct block_device *bdev;
         long ret = 0;
         spin_lock(&bdev_lock);
         list_for_each_entry(bdev, &all_bdevs, bd_list) {
                 ret += bdev->bd_inode->i_mapping->nrpages;
         }
         spin_unlock(&bdev_lock);
         return ret;
 }


It walks every block device and sums the pages held by each bd_inode->i_mapping.
So it looks like "buffers" corresponds to the page cache attached to the block devices' own inodes.




struct block_device {
        dev_t                   bd_dev;  /* not a kdev_t - it's a search key */
        int                     bd_openers;
        struct inode *          bd_inode;       /* will die */
        ...

bd_inode is the block device's own inode, not a regular file's. Judging by the "will die" comment, it is not expected to stay around much longer.


struct inode {
        struct address_space    *i_mapping;     /* points at its own i_data */
        struct address_space    i_data;         /* every inode has its own address_space */
        ...







http://lxr.free-electrons.com/source/fs/block_dev.c
truncate_inode_pages
__sync_blockdev

struct block_device *bdget(dev_t dev)
   iget5_locked        // inode-cache lookup / creation
        get_new_inode
            alloc_inode






http://lxr.free-electrons.com/source/fs/inode.c
int inode_init_always(struct super_block *sb, struct inode *inode)
     struct address_space *const mapping = &inode->i_data;
     inode->i_mapping = mapping;    /* at init time, i_mapping points to the inode's own i_data */



If the inode has metadata buffers attached to mapping->private_list then
try to remove them.
prune_icache()
   invalidate_mapping_pages




http://lxr.free-electrons.com/source/fs/buffer.c

 int inode_has_buffers(struct inode *inode)
 {
         return !list_empty(&inode->i_data.private_list);
 }


 static int sync_buffer(void *word)
 {
         struct block_device *bd;
         struct buffer_head *bh
                 = container_of(word, struct buffer_head, b_state);
 
         smp_mb();
         bd = bh->b_bdev;
         if (bd)
                 blk_run_address_space(bd->bd_inode->i_mapping);
         io_schedule();
         return 0;
 }


 void invalidate_bdev(struct block_device *bdev)
 {
         struct address_space *mapping = bdev->bd_inode->i_mapping;
 
         if (mapping->nrpages == 0)
                 return;
 
         invalidate_bh_lrus();
         invalidate_mapping_pages(mapping, 0, -1);
 }

void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)



 /*
 * Create the page-cache page that contains the requested block.
 *
 * This is used purely for blockdev mappings.
 */
 static struct page *
 grow_dev_page(struct block_device *bdev, sector_t block,
                pgoff_t index, int size)
    struct inode *inode = bdev->bd_inode;
         page = find_or_create_page(inode->i_mapping, index,
                 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
         /* this page, too, is counted under NR_FILE_PAGES */
    bh = alloc_page_buffers(page, size, 0);



grow_buffers(struct block_device *bdev, sector_t block, int size)
         page = grow_dev_page(bdev, block, index, size);






/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions.  A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync().  For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->private_list.
 * ...
 */








/*
 * Invalidate any and all dirty buffers on a given inode.  We are
 * probably unmounting the fs, but that doesn't mean we have already
 * done a sync().  Just drop the buffers from the inode list.
 *
 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 * assumes that all the buffers are against the blockdev.  Not true
 * for reiserfs.
 */
void invalidate_inode_buffers(struct inode *inode)
{
        if (inode_has_buffers(inode)) {
                struct address_space *mapping = &inode->i_data;
                struct list_head *list = &mapping->private_list;
                struct address_space *buffer_mapping = mapping->assoc_mapping;

                spin_lock(&buffer_mapping->private_lock);
                while (!list_empty(list))
                        __remove_assoc_queue(BH_ENTRY(list->next));
                spin_unlock(&buffer_mapping->private_lock);
        }
}
EXPORT_SYMBOL(invalidate_inode_buffers);





/*
 * Remove any clean buffers from the inode's buffer list.  This is called
 * when we're trying to free the inode itself.  Those buffers can pin it.
 *
 * Returns true if all buffers were removed.
 */
int remove_inode_buffers(struct inode *inode)
{
        int ret = 1;

        if (inode_has_buffers(inode)) {
                struct address_space *mapping = &inode->i_data;
                struct list_head *list = &mapping->private_list;
                struct address_space *buffer_mapping = mapping->assoc_mapping;

                spin_lock(&buffer_mapping->private_lock);
                while (!list_empty(list)) {
                        struct buffer_head *bh = BH_ENTRY(list->next);
                        if (buffer_dirty(bh)) {
                                ret = 0;
                                break;
                        }
                        __remove_assoc_queue(bh);
                }
                spin_unlock(&buffer_mapping->private_lock);
        }
        return ret;
}


 void mark_buffer_dirty(struct buffer_head *bh)










/*
 * Historically, a buffer_head was used to map a single block
 * within a page, and of course as the unit of I/O through the
 * filesystem and block layers.  Nowadays the basic I/O unit
 * is the bio, and buffer_heads are used for extracting block
 * mappings (via a get_block_t call), for tracking state within
 * a page (via a page_mapping) and for wrapping bio submission
 * for backward compatibility reasons (e.g. submit_bh).
 */
struct buffer_head {
        unsigned long b_state;          /* buffer state bitmap (see above) */
        struct buffer_head *b_this_page;/* circular list of page's buffers */
        struct page *b_page;            /* the page this bh is mapped to */

        sector_t b_blocknr;             /* start block number */
        size_t b_size;                  /* size of mapping */
        char *b_data;                   /* pointer to data within the page */

        struct block_device *b_bdev;
        bh_end_io_t *b_end_io;          /* I/O completion */
        void *b_private;                /* reserved for b_end_io */
        struct list_head b_assoc_buffers; /* associated with another mapping */
        struct address_space *b_assoc_map;      /* mapping this buffer is
                                                   associated with */
        atomic_t b_count;               /* users using this buffer_head */
};



submit_bh(write_op, bh);


A buffer_head is the structure used to build bio requests; submitting I/O to the disk depends on the information carried in these buffers.


generic_perform_write
  block_write_begin
    grab_cache_page_write_begin
       add_to_page_cache_lru
 




http://lxr.free-electrons.com/source/fs/fs-writeback.c
Write out an inode and its dirty pages
 __writeback_single_inode




static int blkdev_open(struct inode * inode, struct file * filp)
{
        struct block_device *bdev;

        /*
         * Preserve backwards compatibility and allow large file access
         * even if userspace doesn't ask for it explicitly. Some mkfs
         * binary needs it. We might want to drop this workaround
         * during an unstable branch.
         */
        filp->f_flags |= O_LARGEFILE;

        if (filp->f_flags & O_NDELAY)
                filp->f_mode |= FMODE_NDELAY;
        if (filp->f_flags & O_EXCL)
                filp->f_mode |= FMODE_EXCL;
        if ((filp->f_flags & O_ACCMODE) == 3)
                filp->f_mode |= FMODE_WRITE_IOCTL;

        bdev = bd_acquire(inode);
        if (bdev == NULL)
                return -ENOMEM;

        filp->f_mapping = bdev->bd_inode->i_mapping;    /* the device file's mapping is the bdev inode's i_mapping */

        return blkdev_get(bdev, filp->f_mode, filp);
}

static const struct address_space_operations def_blk_aops = {
        .readpage       = blkdev_readpage,
        .writepage      = blkdev_writepage,
        .write_begin    = blkdev_write_begin,
        .write_end      = blkdev_write_end,
        .writepages     = generic_writepages,
        .releasepage    = blkdev_releasepage,
        .direct_IO      = blkdev_direct_IO,
        .is_dirty_writeback = buffer_check_dirty_writeback,
};
const struct file_operations def_blk_fops = {
        .open           = blkdev_open,
        .release        = blkdev_close,
        .llseek         = block_llseek,
        .read           = new_sync_read,
        .write          = new_sync_write,
        .read_iter      = blkdev_read_iter,
        .write_iter     = blkdev_write_iter,
        .mmap           = generic_file_mmap,
        .fsync          = blkdev_fsync,
        .unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = compat_blkdev_ioctl,
#endif
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
};

new_sync_write
  -> blkdev_write_iter
    -> __generic_file_write_iter
      -> generic_perform_write
        -> blkdev_write_begin
          -> block_write_begin
            -> grab_cache_page_write_begin
              -> pagecache_get_page
                -> add_to_page_cache_lru    // here the page-cache accounting happens;
                                            // the inode involved is the block device's bd_inode


-----------------------
static const struct super_operations ext4_sops = {
        .alloc_inode    = ext4_alloc_inode,
        .destroy_inode  = ext4_destroy_inode,
        .write_inode    = ext4_write_inode,
        .dirty_inode    = ext4_dirty_inode,
        .drop_inode     = ext4_drop_inode,
        .evict_inode    = ext4_evict_inode,
        .put_super      = ext4_put_super,
        .sync_fs        = ext4_sync_fs,
        .freeze_fs      = ext4_freeze,
        .unfreeze_fs    = ext4_unfreeze,
        .statfs         = ext4_statfs,
        .remount_fs     = ext4_remount,
        .show_options   = ext4_show_options,
#ifdef CONFIG_QUOTA
        .quota_read     = ext4_quota_read,
        .quota_write    = ext4_quota_write,
#endif
        .bdev_try_to_free_page = bdev_try_to_free_page,
};

static struct inode *ext4_alloc_inode(struct super_block *sb)
{
        struct ext4_inode_info *ei;
        ...
        return &ei->vfs_inode;
}
------------------------------------------

When a block device is accessed directly, the file uses the block device's inode, and the page cache presumably uses that inode's mapping as well. That is why buffers grows so quickly here:

# cat /dev/sda1 > /dev/null
# free
             total       used       free     shared    buffers     cached
Mem:       1032548     432732     599816          0     235144     120516
-/+ buffers/cache:      77072     955476
Swap:       688124          0     688124
# cat /dev/sda1 > /dev/null
# free
             total       used       free     shared    buffers     cached
Mem:       1032548     538372     494176          0     339080     120516
-/+ buffers/cache:      78776     953772
Swap:       688124          0     688124
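The same experiment can be reproduced from a program; a minimal sketch that assumes /dev/sda1 exists and that it runs as root (watch the buffers column in free while it runs):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Sketch: reading a block device node directly fills the device's
 * bd_inode page cache, i.e. exactly what the buffers column counts.
 * Assumes /dev/sda1 exists and the program runs as root. */
int main(void)
{
        static char buf[1 << 16];
        long total = 0;
        int fd = open("/dev/sda1", O_RDONLY);

        if (fd < 0) {
                perror("open /dev/sda1");
                return 1;
        }
        while (total < (256L << 20)) {    /* read the first 256 MiB */
                ssize_t n = read(fd, buf, sizeof(buf));
                if (n <= 0)
                        break;
                total += n;
        }
        close(fd);
        printf("read %ld bytes; compare `free` before and after\n", total);
        return 0;
}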



http://lxr.free-electrons.com/source/fs/drop_caches.c
sysctl_drop_caches = 1 runs
drop_pagecache_sb   // walk every inode of each super_block and release its mapped pages:
  invalidate_mapping_pages

sysctl_drop_caches = 2 runs
drop_slab
  shrink_slab   // because inodes and dentries are allocated from the slab allocator

alloc_super registers the slab-shrinker callback for each super block in advance:
  register_shrinker(&s->s_shrink);
super_cache_scan   // this is where the super block's dentry and inode caches get pruned:
        freed = prune_dcache_sb(sb, dentries, sc->nid);
        freed += prune_icache_sb(sb, inodes, sc->nid);

To free pagecache:
echo 1 > /proc/sys/vm/drop_caches

To free dentries and inodes:
echo 2 > /proc/sys/vm/drop_caches

To free pagecache, dentries and inodes:
echo 3 > /proc/sys/vm/drop_caches
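For completeness, the same write can be issued from C; a minimal sketch that assumes root privileges:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Sketch: the echo above, done from C. Needs root. Dirty pages are not
 * dropped, so sync() first to make the effect as visible as possible. */
int main(void)
{
        int fd;

        sync();                          /* flush dirty pages to disk first */
        fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
        if (fd < 0) {
                perror("open /proc/sys/vm/drop_caches");
                return 1;
        }
        if (write(fd, "1", 1) != 1)      /* 1 = free page cache only */
                perror("write");
        close(fd);
        return 0;
}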


# echo 1 > /proc/sys/vm/drop_caches
#
# free
             total       used       free     shared    buffers     cached
Mem:       1032548      89080     943468          0         44      15544
-/+ buffers/cache:      73492     959056
Swap:       688124          0     688124


Buffers drops dramatically, so most of it really was page cache produced by operating on the block device directly,
not (filesystem) metadata.

---------------------------------------





To summarize:
Every file has its own inode structure, and every inode has its own address_space (i_data / i_mapping) member,
which manages the page cache used when reading and writing that file. The address_space records which pages an
inode is using, and there are helpers for navigating the page <-> address_space <-> inode relationships.
Filesystem features such as writing dirty pages back to disk are built on these structures working together.

In free's output:
buffers counts the pages in the address_space of every block device's inode.
cached counts all the pages the system uses for file caching, minus the swap-cache pages and minus buffers.

Some people online claim that pages holding filesystem metadata are also counted in buffers, but I looked and
   did not find code doing that. What can be confirmed is that page cache created by accessing a block device
   directly is kept in the block device's bd_inode, which is why reading /dev/sda1 and the like makes the
   buffers figure grow. bd_inode itself no longer looks very useful; the kernel comment suggests it is slated for removal.

cached should cover the page cache produced by reading and writing ordinary files.

The dentry and inode caches are allocated from the ordinary slab allocator and managed per super block.
The space they occupy is not counted in buffers or cached above; it should be visible in /proc/slabinfo instead (see the sketch below).

The system keeps global statistics on page usage; the global NR_FILE_PAGES item, for instance, is the number of pages used for file caching.
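As promised above, here is a small sketch that picks those slab caches out of /proc/slabinfo (cache names such as ext4_inode_cache vary by filesystem; reading the file usually requires root):

#include <stdio.h>
#include <string.h>

/* Sketch: print the dentry cache and the per-filesystem inode caches
 * from /proc/slabinfo. */
int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/slabinfo", "r");

        if (!f) {
                perror("open /proc/slabinfo");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, "dentry", 6) == 0 ||
                    strstr(line, "inode_cache") != NULL)
                        fputs(line, stdout);
        }
        fclose(f);
        return 0;
}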