forked from luck/tmp_suning_uos_patched
3d733633a6
This is the real meat of the entire series. It actually implements the tracking of the number of writers to a mount. However, it causes scalability problems because there can be hundreds of cpus doing open()/close() on files on the same mnt at the same time. Even an atomic_t in the mnt has massive scaling problems because the cacheline gets so terribly contended. This uses a statically-allocated percpu variable. All want/drop operations are local to a cpu as long as that cpu operates on the same mount, and there are no writer count imbalances. Writer count imbalances happen when a write is taken on one cpu, and released on another, like when an open/close pair is performed on two different cpus. Upon a remount,ro request, all of the data from the percpu variables is collected (expensive, but very rare) and we determine if there are any outstanding writers to the mount. I've written a little benchmark to sit in a loop for a couple of seconds in several cpus in parallel doing open/write/close loops. http://sr71.net/~dave/linux/openbench.c The code in here is a worst-possible case for this patch. It does opens on a _pair_ of files in two different mounts in parallel. This should cause my code to lose its "operate on the same mount" optimization completely. This worst-case scenario causes a 3% degradation in the benchmark. I could probably get rid of even this 3%, but it would be more complex than what I have here, and I think this is getting into acceptable territory. In practice, I expect writing more than 3 bytes to a file, as well as disk I/O, to mask any effects that this has. (To get rid of that 3%, we could have a #defined number of mounts in the percpu variable. So, instead of a CPU getting to operate only on percpu data when it accesses only one mount, it could stay on percpu data when it only accesses N or fewer mounts.) 
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount Acked-by: Al Viro <viro@ZenIV.linux.org.uk> Signed-off-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Dave Hansen <haveblue@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
118 lines
3.5 KiB
C
118 lines
3.5 KiB
C
/*
 *
 * Definitions for mount interface. This describes the in-kernel
 * linked list of mounted filesystems.
 *
 * Author: Marco van Wieringen <mvw@planets.elm.net>
 *
 * Version: $Id: mount.h,v 2.0 1996/11/17 16:48:14 mvw Exp mvw $
 *
 */
|
|
#ifndef _LINUX_MOUNT_H
|
|
#define _LINUX_MOUNT_H
|
|
#ifdef __KERNEL__
|
|
|
|
#include <linux/types.h>
|
|
#include <linux/list.h>
|
|
#include <linux/nodemask.h>
|
|
#include <linux/spinlock.h>
|
|
#include <asm/atomic.h>
|
|
|
|
/*
 * Forward declarations, so that the prototypes and struct members in
 * this header can name these types through pointers without pulling in
 * the headers that define them.
 */
struct super_block;
struct vfsmount;
struct dentry;
struct mnt_namespace;
|
|
|
|
/*
 * Per-mount flag bits.
 * NOTE(review): presumably stored in vfsmount->mnt_flags, with the first
 * group mirroring the nosuid/nodev/noexec/noatime/nodiratime/relatime
 * mount options - confirm against the users of these macros.
 */
#define MNT_NOSUID	0x01
#define MNT_NODEV	0x02
#define MNT_NOEXEC	0x04
#define MNT_NOATIME	0x08
#define MNT_NODIRATIME	0x10
#define MNT_RELATIME	0x20

#define MNT_SHRINKABLE	0x100
#define MNT_IMBALANCED_WRITE_COUNT	0x200 /* just for debugging */

/* Propagation-type bits; MNT_PNODE_MASK is MNT_SHARED | MNT_UNBINDABLE. */
#define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE	0x2000	/* if the vfsmount is an unbindable mount */
#define MNT_PNODE_MASK	0x3000	/* propagation flag mask */
|
|
|
|
struct vfsmount {
|
|
struct list_head mnt_hash;
|
|
struct vfsmount *mnt_parent; /* fs we are mounted on */
|
|
struct dentry *mnt_mountpoint; /* dentry of mountpoint */
|
|
struct dentry *mnt_root; /* root of the mounted tree */
|
|
struct super_block *mnt_sb; /* pointer to superblock */
|
|
struct list_head mnt_mounts; /* list of children, anchored here */
|
|
struct list_head mnt_child; /* and going through their mnt_child */
|
|
int mnt_flags;
|
|
/* 4 bytes hole on 64bits arches */
|
|
char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
|
|
struct list_head mnt_list;
|
|
struct list_head mnt_expire; /* link in fs-specific expiry list */
|
|
struct list_head mnt_share; /* circular list of shared mounts */
|
|
struct list_head mnt_slave_list;/* list of slave mounts */
|
|
struct list_head mnt_slave; /* slave list entry */
|
|
struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */
|
|
struct mnt_namespace *mnt_ns; /* containing namespace */
|
|
/*
|
|
* We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
|
|
* to let these frequently modified fields in a separate cache line
|
|
* (so that reads of mnt_flags wont ping-pong on SMP machines)
|
|
*/
|
|
atomic_t mnt_count;
|
|
int mnt_expiry_mark; /* true if marked for expiry */
|
|
int mnt_pinned;
|
|
int mnt_ghosts;
|
|
/*
|
|
* This value is not stable unless all of the mnt_writers[] spinlocks
|
|
* are held, and all mnt_writer[]s on this mount have 0 as their ->count
|
|
*/
|
|
atomic_t __mnt_writers;
|
|
};
|
|
|
|
static inline struct vfsmount *mntget(struct vfsmount *mnt)
|
|
{
|
|
if (mnt)
|
|
atomic_inc(&mnt->mnt_count);
|
|
return mnt;
|
|
}
|
|
|
|
/*
 * Out-of-line mount helpers; only declared here, implemented elsewhere.
 *
 * NOTE(review): mnt_want_write()/mnt_drop_write() look like the paired
 * take/release operations for write access to a mount (cf. __mnt_writers
 * in struct vfsmount); the meaning of the int results of mnt_want_write()
 * and __mnt_is_readonly() is not visible from this header - confirm
 * against the implementation.
 */
extern int mnt_want_write(struct vfsmount *mnt);
extern void mnt_drop_write(struct vfsmount *mnt);
extern void mntput_no_expire(struct vfsmount *mnt);
extern void mnt_pin(struct vfsmount *mnt);
extern void mnt_unpin(struct vfsmount *mnt);
extern int __mnt_is_readonly(struct vfsmount *mnt);
|
|
|
|
static inline void mntput(struct vfsmount *mnt)
|
|
{
|
|
if (mnt) {
|
|
mnt->mnt_expiry_mark = 0;
|
|
mntput_no_expire(mnt);
|
|
}
|
|
}
|
|
|
|
/*
 * Allocation and kernel-internal mount entry points; only declared here.
 * NOTE(review): ownership of the returned vfsmount and the error
 * convention of the pointer-returning functions are not visible from
 * this header - confirm against the implementation.
 */
extern void free_vfsmnt(struct vfsmount *mnt);
extern struct vfsmount *alloc_vfsmnt(const char *name);
extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
				      const char *name, void *data);

struct file_system_type;
extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
				       int flags, const char *name,
				       void *data);
|
|
|
|
struct nameidata;
|
|
|
|
extern int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
|
|
int mnt_flags, struct list_head *fslist);
|
|
|
|
extern void mark_mounts_for_expiry(struct list_head *mounts);
|
|
|
|
extern spinlock_t vfsmount_lock;
|
|
extern dev_t name_to_dev_t(char *name);
|
|
|
|
#endif
|
|
#endif /* _LINUX_MOUNT_H */
|