kernel_optimize_test/fs/proc/root.c
Alexey Dobriyan 2d3a4e3666 proc: fix ->open'less usage due to ->proc_fops flip
Typical PDE creation code looks like:

	pde = create_proc_entry("foo", 0, NULL);
	if (pde)
		pde->proc_fops = &foo_proc_fops;

Notice that PDE is first created, only then ->proc_fops is set up to
final value. This is a problem because right after creation
a) PDE is fully visible in /proc, and
b) ->proc_fops are proc_file_operations, which do not have an ->open callback. So it's
   possible to call ->read without ->open (see one class of oopses below).

The fix is new API called proc_create() which makes sure ->proc_fops are
set up before gluing PDE to main tree. Typical new code looks like:

	pde = proc_create("foo", 0, NULL, &foo_proc_fops);
	if (!pde)
		return -ENOMEM;

Fix most networking users for a start.

In the long run, create_proc_entry() for regular files will go.

BUG: unable to handle kernel NULL pointer dereference at virtual address 00000024
printing eip: c1188c1b *pdpt = 000000002929e001 *pde = 0000000000000000
Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC
last sysfs file: /sys/block/sda/sda1/dev
Modules linked in: foo af_packet ipv6 cpufreq_ondemand loop serio_raw psmouse k8temp hwmon sr_mod cdrom

Pid: 24679, comm: cat Not tainted (2.6.24-rc3-mm1 #2)
EIP: 0060:[<c1188c1b>] EFLAGS: 00210002 CPU: 0
EIP is at mutex_lock_nested+0x75/0x25d
EAX: 000006fe EBX: fffffffb ECX: 00001000 EDX: e9340570
ESI: 00000020 EDI: 00200246 EBP: e9340570 ESP: e8ea1ef8
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process cat (pid: 24679, ti=E8EA1000 task=E9340570 task.ti=E8EA1000)
Stack: 00000000 c106f7ce e8ee05b4 00000000 00000001 458003d0 f6fb6f20 fffffffb
       00000000 c106f7aa 00001000 c106f7ce 08ae9000 f6db53f0 00000020 00200246
       00000000 00000002 00000000 00200246 00200246 e8ee05a0 fffffffb e8ee0550
Call Trace:
 [<c106f7ce>] seq_read+0x24/0x28a
 [<c106f7aa>] seq_read+0x0/0x28a
 [<c106f7ce>] seq_read+0x24/0x28a
 [<c106f7aa>] seq_read+0x0/0x28a
 [<c10818b8>] proc_reg_read+0x60/0x73
 [<c1081858>] proc_reg_read+0x0/0x73
 [<c105a34f>] vfs_read+0x6c/0x8b
 [<c105a6f3>] sys_read+0x3c/0x63
 [<c10025f2>] sysenter_past_esp+0x5f/0xa5
 [<c10697a7>] destroy_inode+0x24/0x33
 =======================
INFO: lockdep is turned off.
Code: 75 21 68 e1 1a 19 c1 68 87 00 00 00 68 b8 e8 1f c1 68 25 73 1f c1 e8 84 06 e9 ff e8 52 b8 e7 ff 83 c4 10 9c 5f fa e8 28 89 ea ff <f0> fe 4e 04 79 0a f3 90 80 7e 04 00 7e f8 eb f0 39 76 34 74 33
EIP: [<c1188c1b>] mutex_lock_nested+0x75/0x25d SS:ESP 0068:e8ea1ef8

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-08 09:22:24 -08:00

241 lines
5.2 KiB
C

/*
* linux/fs/proc/root.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* proc root directory handling functions
*/
#include <asm/uaccess.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/smp_lock.h>
#include <linux/mount.h>
#include <linux/pid_namespace.h>
#include "internal.h"
/*
 * Entries for the "bus", "fs" and "driver" directories. Each pointer is
 * assigned in proc_root_init() from the matching proc_mkdir() call.
 */
struct proc_dir_entry *proc_bus, *proc_root_fs, *proc_root_driver;
/*
 * Comparison callback handed to sget() by proc_get_sb().
 *
 * A superblock matches when its s_fs_info is the very pointer passed in
 * @data. Returns non-zero on a match, zero otherwise.
 */
static int proc_test_super(struct super_block *sb, void *data)
{
	if (sb->s_fs_info != data)
		return 0;

	return 1;
}
/*
 * Initialisation callback handed to sget() by proc_get_sb().
 *
 * @sb:   superblock being set up
 * @data: the struct pid_namespace this proc instance belongs to
 *
 * Returns the result of set_anon_super(). The pid namespace reference is
 * taken, and stored in sb->s_fs_info, only once set_anon_super() has
 * succeeded. If it were taken first and set_anon_super() then failed, the
 * superblock would be thrown away without proc_kill_sb() running, and the
 * reference obtained by get_pid_ns() would never be dropped.
 */
static int proc_set_super(struct super_block *sb, void *data)
{
	int err = set_anon_super(sb, NULL);

	if (!err) {
		struct pid_namespace *ns = (struct pid_namespace *)data;

		/* Paired with put_pid_ns() in proc_kill_sb(). */
		sb->s_fs_info = get_pid_ns(ns);
	}
	return err;
}
/*
 * Mount callback for the proc filesystem (installed as .get_sb in
 * proc_fs_type).
 *
 * Selects the pid namespace for this mount, looks up or creates the
 * superblock keyed on that namespace through sget(), fills in the
 * superblock if it has no root yet, and attaches it to @mnt.
 *
 * Returns the result of simple_set_mnt(), or the error reported by
 * sget() or proc_fill_super().
 */
static int proc_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
int err;
struct super_block *sb;
struct pid_namespace *ns;
struct proc_inode *ei;
if (proc_mnt) {
/*
 * Seed the root directory with a pid so it doesn't need
 * to be special in base.c. I would do this earlier but
 * the only task alive when /proc is mounted the first time
 * is the init_task and it doesn't have any pids.
 */
ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode);
if (!ei->pid)
ei->pid = find_get_pid(1);
}
/*
 * With MS_KERNMOUNT the namespace is passed in @data; otherwise the
 * current task's pid namespace is used.
 */
if (flags & MS_KERNMOUNT)
ns = (struct pid_namespace *)data;
else
ns = current->nsproxy->pid_ns;
/* Superblocks are matched on the namespace pointer (proc_test_super). */
sb = sget(fs_type, proc_test_super, proc_set_super, ns);
if (IS_ERR(sb))
return PTR_ERR(sb);
/* No root dentry yet, so this superblock still has to be populated. */
if (!sb->s_root) {
sb->s_flags = flags;
err = proc_fill_super(sb);
if (err) {
/*
 * Filling failed: release s_umount (presumably held for
 * write since sget() - confirm) and drop the superblock.
 */
up_write(&sb->s_umount);
deactivate_super(sb);
return err;
}
/* Give the new root inode the struct pid of pid 1 in @ns, if unset. */
ei = PROC_I(sb->s_root->d_inode);
if (!ei->pid) {
rcu_read_lock();
ei->pid = get_pid(find_pid_ns(1, ns));
rcu_read_unlock();
}
sb->s_flags |= MS_ACTIVE;
/* Remembered so pid_ns_release_proc() can mntput() it later. */
ns->proc_mnt = mnt;
}
return simple_set_mnt(mnt, sb);
}
/*
 * Superblock teardown for proc (installed as .kill_sb in proc_fs_type).
 *
 * The pid namespace pointer is read out of s_fs_info before
 * kill_anon_super() runs; the namespace reference is dropped only after
 * the superblock has been killed.
 */
static void proc_kill_sb(struct super_block *sb)
{
	struct pid_namespace *pid_ns = (struct pid_namespace *)sb->s_fs_info;

	kill_anon_super(sb);
	put_pid_ns(pid_ns);
}
/*
 * Filesystem type for "proc". Registered by proc_root_init() and mounted
 * through kern_mount_data() there and in pid_ns_prepare_proc().
 */
static struct file_system_type proc_fs_type = {
.name = "proc",
.get_sb = proc_get_sb,
.kill_sb = proc_kill_sb,
};
/*
 * Boot-time setup of /proc.
 *
 * Initialises the inode cache, registers proc_fs_type, performs the
 * kernel-internal mount for init_pid_ns, and then creates the standard
 * directories and calls the other proc_*_init() routines in order.
 *
 * Returns early, without reporting the error to the caller, if any of
 * the first three steps fails.
 */
void __init proc_root_init(void)
{
int err = proc_init_inodecache();
if (err)
return;
err = register_filesystem(&proc_fs_type);
if (err)
return;
proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
/* NOTE(review): err is not read again after this assignment. */
err = PTR_ERR(proc_mnt);
if (IS_ERR(proc_mnt)) {
/* Mount failed: undo the registration done above. */
unregister_filesystem(&proc_fs_type);
return;
}
proc_misc_init();
proc_net_init();
#ifdef CONFIG_SYSVIPC
proc_mkdir("sysvipc", NULL);
#endif
/* Keep the entries for "fs" and "driver" in the exported globals. */
proc_root_fs = proc_mkdir("fs", NULL);
proc_root_driver = proc_mkdir("driver", NULL);
proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */
#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
/* just give it a mountpoint */
proc_mkdir("openprom", NULL);
#endif
proc_tty_init();
#ifdef CONFIG_PROC_DEVICETREE
proc_device_tree_init();
#endif
proc_bus = proc_mkdir("bus", NULL);
proc_sys_init();
}
/*
 * getattr for the /proc root directory.
 *
 * Fills @stat from the dentry's inode with generic_fillattr(), then
 * overrides the link count with proc_root.nlink plus the value returned
 * by nr_processes(). Always returns 0.
 */
static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry,
			     struct kstat *stat)
{
	struct inode *root_inode = dentry->d_inode;

	generic_fillattr(root_inode, stat);

	/* Must follow generic_fillattr(), which also writes into *stat. */
	stat->nlink = proc_root.nlink + nr_processes();

	return 0;
}
/*
 * Lookup in the /proc root directory.
 *
 * proc_lookup() is tried first. If it returns NULL, the lookup ends
 * there and NULL is returned. Any other return value means the name is
 * handed on to proc_pid_lookup(), whose result is returned.
 */
static struct dentry *proc_root_lookup(struct inode *dir,
				       struct dentry *dentry,
				       struct nameidata *nd)
{
	if (proc_lookup(dir, dentry, nd))
		return proc_pid_lookup(dir, dentry, nd);

	return NULL;
}
/*
 * readdir for the /proc root directory.
 *
 * While the file position is below FIRST_PROCESS_ENTRY, proc_readdir()
 * is called under the big kernel lock. A result <= 0 from it is returned
 * as is. Otherwise the position is moved to FIRST_PROCESS_ENTRY and,
 * with the lock released, proc_pid_readdir() provides the return value.
 */
static int proc_root_readdir(struct file *filp, void *dirent,
			     filldir_t filldir)
{
	unsigned int pos = filp->f_pos;
	/* Positive by default so the pid listing runs when the first phase is skipped. */
	int status = 1;

	lock_kernel();
	if (pos < FIRST_PROCESS_ENTRY) {
		status = proc_readdir(filp, dirent, filldir);
		if (status > 0)
			filp->f_pos = FIRST_PROCESS_ENTRY;
	}
	unlock_kernel();

	if (status <= 0)
		return status;

	return proc_pid_readdir(filp, dirent, filldir);
}
/*
 * The root /proc directory is special, as it has the <pid>
 * directories. Thus we don't use the generic directory handling
 * functions for it: readdir goes through proc_root_readdir(), which
 * calls proc_readdir() and then proc_pid_readdir().
 */
static const struct file_operations proc_root_operations = {
.read = generic_read_dir,
.readdir = proc_root_readdir,
};
/*
 * The proc root can do almost nothing: only lookup and getattr are
 * provided here.
 */
static const struct inode_operations proc_root_inode_operations = {
.lookup = proc_root_lookup,
.getattr = proc_root_getattr,
};
/*
 * This is the root "inode" in the /proc tree: a statically initialised
 * directory entry wired to the root-specific inode and file operations
 * defined in this file.
 */
struct proc_dir_entry proc_root = {
.low_ino = PROC_ROOT_INO,
.namelen = 5, /* number of characters in "/proc" */
.name = "/proc",
.mode = S_IFDIR | S_IRUGO | S_IXUGO,
.nlink = 2, /* base value; proc_root_getattr() adds nr_processes() */
.count = ATOMIC_INIT(1),
.proc_iops = &proc_root_inode_operations,
.proc_fops = &proc_root_operations,
.parent = &proc_root, /* the root entry is its own parent */
};
/*
 * Mount proc internally for the pid namespace @ns.
 *
 * Returns 0 on success, or the error encoded in the pointer returned by
 * kern_mount_data(). The mount pointer is not stored here; proc_get_sb()
 * records it in ns->proc_mnt when it sets up a new superblock.
 */
int pid_ns_prepare_proc(struct pid_namespace *ns)
{
	struct vfsmount *proc_mount = kern_mount_data(&proc_fs_type, ns);

	return IS_ERR(proc_mount) ? PTR_ERR(proc_mount) : 0;
}
/*
 * Drop the proc mount recorded in @ns->proc_mnt by calling mntput() on it.
 */
void pid_ns_release_proc(struct pid_namespace *ns)
{
	struct vfsmount *proc_mount = ns->proc_mnt;

	mntput(proc_mount);
}
/*
 * Symbol exports for the procfs interface. The first five functions are
 * not defined in the part of the file shown here; proc_root,
 * proc_root_fs, proc_bus and proc_root_driver are the objects defined
 * above.
 */
EXPORT_SYMBOL(proc_symlink);
EXPORT_SYMBOL(proc_mkdir);
EXPORT_SYMBOL(create_proc_entry);
EXPORT_SYMBOL(proc_create);
EXPORT_SYMBOL(remove_proc_entry);
EXPORT_SYMBOL(proc_root);
EXPORT_SYMBOL(proc_root_fs);
EXPORT_SYMBOL(proc_bus);
EXPORT_SYMBOL(proc_root_driver);