kernel_optimize_test/block/bsg.c

1120 lines
23 KiB
C
Raw Normal View History

/*
* bsg.c - block layer implementation of the sg v4 interface
*
* Copyright (C) 2004 Jens Axboe <axboe@suse.de> SUSE Labs
* Copyright (C) 2004 Peter M. Jones <pjones@redhat.com>
*
* This file is subject to the terms and conditions of the GNU General Public
* License version 2. See the file "COPYING" in the main directory of this
* archive for more details.
*
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/poll.h>
#include <linux/cdev.h>
#include <linux/jiffies.h>
#include <linux/percpu.h>
#include <linux/uio.h>
#include <linux/idr.h>
#include <linux/bsg.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_driver.h>
#include <scsi/sg.h>
#define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver"
#define BSG_VERSION "0.4"
struct bsg_device {
struct request_queue *queue;
spinlock_t lock;
struct list_head busy_list;
struct list_head done_list;
struct hlist_node dev_list;
atomic_t ref_count;
int queued_cmds;
int done_cmds;
wait_queue_head_t wq_done;
wait_queue_head_t wq_free;
char name[20];
int max_queue;
unsigned long flags;
};
enum {
BSG_F_BLOCK = 1,
};
#define BSG_DEFAULT_CMDS 64
#define BSG_MAX_DEVS 32768
#undef BSG_DEBUG
#ifdef BSG_DEBUG
#define dprintk(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ##args)
#else
#define dprintk(fmt, args...)
#endif
static DEFINE_MUTEX(bsg_mutex);
static DEFINE_IDR(bsg_minor_idr);
#define BSG_LIST_ARRAY_SIZE 8
static struct hlist_head bsg_device_list[BSG_LIST_ARRAY_SIZE];
static struct class *bsg_class;
static int bsg_major;
static struct kmem_cache *bsg_cmd_cachep;
/*
* our internal command type
*/
struct bsg_command {
struct bsg_device *bd;
struct list_head list;
struct request *rq;
struct bio *bio;
struct bio *bidi_bio;
int err;
struct sg_io_v4 hdr;
char sense[SCSI_SENSE_BUFFERSIZE];
};
static void bsg_free_command(struct bsg_command *bc)
{
struct bsg_device *bd = bc->bd;
unsigned long flags;
kmem_cache_free(bsg_cmd_cachep, bc);
spin_lock_irqsave(&bd->lock, flags);
bd->queued_cmds--;
spin_unlock_irqrestore(&bd->lock, flags);
wake_up(&bd->wq_free);
}
static struct bsg_command *bsg_alloc_command(struct bsg_device *bd)
{
struct bsg_command *bc = ERR_PTR(-EINVAL);
spin_lock_irq(&bd->lock);
if (bd->queued_cmds >= bd->max_queue)
goto out;
bd->queued_cmds++;
spin_unlock_irq(&bd->lock);
bc = kmem_cache_zalloc(bsg_cmd_cachep, GFP_KERNEL);
if (unlikely(!bc)) {
spin_lock_irq(&bd->lock);
bd->queued_cmds--;
bc = ERR_PTR(-ENOMEM);
goto out;
}
bc->bd = bd;
INIT_LIST_HEAD(&bc->list);
dprintk("%s: returning free cmd %p\n", bd->name, bc);
return bc;
out:
spin_unlock_irq(&bd->lock);
return bc;
}
static inline struct hlist_head *bsg_dev_idx_hash(int index)
{
return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
}
static int bsg_io_schedule(struct bsg_device *bd)
{
DEFINE_WAIT(wait);
int ret = 0;
spin_lock_irq(&bd->lock);
BUG_ON(bd->done_cmds > bd->queued_cmds);
/*
* -ENOSPC or -ENODATA? I'm going for -ENODATA, meaning "I have no
* work to do", even though we return -ENOSPC after this same test
* during bsg_write() -- there, it means our buffer can't have more
* bsg_commands added to it, thus has no space left.
*/
if (bd->done_cmds == bd->queued_cmds) {
ret = -ENODATA;
goto unlock;
}
if (!test_bit(BSG_F_BLOCK, &bd->flags)) {
ret = -EAGAIN;
goto unlock;
}
prepare_to_wait(&bd->wq_done, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&bd->lock);
io_schedule();
finish_wait(&bd->wq_done, &wait);
return ret;
unlock:
spin_unlock_irq(&bd->lock);
return ret;
}
static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
struct sg_io_v4 *hdr, struct bsg_device *bd,
fmode_t has_write_perm)
{
if (hdr->request_len > BLK_MAX_CDB) {
rq->cmd = kzalloc(hdr->request_len, GFP_KERNEL);
if (!rq->cmd)
return -ENOMEM;
}
if (copy_from_user(rq->cmd, (void *)(unsigned long)hdr->request,
hdr->request_len))
return -EFAULT;
if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) {
if (blk_verify_command(rq->cmd, has_write_perm))
return -EPERM;
} else if (!capable(CAP_SYS_RAWIO))
return -EPERM;
/*
* fill in request structure
*/
rq->cmd_len = hdr->request_len;
rq->cmd_type = REQ_TYPE_BLOCK_PC;
rq->timeout = msecs_to_jiffies(hdr->timeout);
if (!rq->timeout)
rq->timeout = q->sg_timeout;
if (!rq->timeout)
rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
if (rq->timeout < BLK_MIN_SG_TIMEOUT)
rq->timeout = BLK_MIN_SG_TIMEOUT;
return 0;
}
/*
* Check if sg_io_v4 from user is allowed and valid
*/
static int
bsg_validate_sgv4_hdr(struct request_queue *q, struct sg_io_v4 *hdr, int *rw)
{
int ret = 0;
if (hdr->guard != 'Q')
return -EINVAL;
switch (hdr->protocol) {
case BSG_PROTOCOL_SCSI:
switch (hdr->subprotocol) {
case BSG_SUB_PROTOCOL_SCSI_CMD:
case BSG_SUB_PROTOCOL_SCSI_TRANSPORT:
break;
default:
ret = -EINVAL;
}
break;
default:
ret = -EINVAL;
}
*rw = hdr->dout_xfer_len ? WRITE : READ;
return ret;
}
/*
* map sg_io_v4 to a request.
*/
static struct request *
bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
u8 *sense)
{
struct request_queue *q = bd->queue;
struct request *rq, *next_rq = NULL;
int ret, rw;
unsigned int dxfer_len;
void *dxferp = NULL;
dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp,
hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp,
hdr->din_xfer_len);
ret = bsg_validate_sgv4_hdr(q, hdr, &rw);
if (ret)
return ERR_PTR(ret);
/*
* map scatter-gather elements separately and string them to request
*/
rq = blk_get_request(q, rw, GFP_KERNEL);
if (!rq)
return ERR_PTR(-ENOMEM);
ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm);
if (ret)
goto out;
if (rw == WRITE && hdr->din_xfer_len) {
if (!test_bit(QUEUE_FLAG_BIDI, &q->queue_flags)) {
ret = -EOPNOTSUPP;
goto out;
}
next_rq = blk_get_request(q, READ, GFP_KERNEL);
if (!next_rq) {
ret = -ENOMEM;
goto out;
}
rq->next_rq = next_rq;
next_rq->cmd_type = rq->cmd_type;
dxferp = (void*)(unsigned long)hdr->din_xferp;
ret = blk_rq_map_user(q, next_rq, NULL, dxferp,
hdr->din_xfer_len, GFP_KERNEL);
if (ret)
goto out;
}
if (hdr->dout_xfer_len) {
dxfer_len = hdr->dout_xfer_len;
dxferp = (void*)(unsigned long)hdr->dout_xferp;
} else if (hdr->din_xfer_len) {
dxfer_len = hdr->din_xfer_len;
dxferp = (void*)(unsigned long)hdr->din_xferp;
} else
dxfer_len = 0;
if (dxfer_len) {
ret = blk_rq_map_user(q, rq, NULL, dxferp, dxfer_len,
GFP_KERNEL);
if (ret)
goto out;
}
rq->sense = sense;
rq->sense_len = 0;
return rq;
out:
if (rq->cmd != rq->__cmd)
kfree(rq->cmd);
blk_put_request(rq);
if (next_rq) {
blk_rq_unmap_user(next_rq->bio);
blk_put_request(next_rq);
}
return ERR_PTR(ret);
}
/*
* async completion call-back from the block layer, when scsi/ide/whatever
* calls end_that_request_last() on a request
*/
static void bsg_rq_end_io(struct request *rq, int uptodate)
{
struct bsg_command *bc = rq->end_io_data;
struct bsg_device *bd = bc->bd;
unsigned long flags;
dprintk("%s: finished rq %p bc %p, bio %p stat %d\n",
bd->name, rq, bc, bc->bio, uptodate);
bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
spin_lock_irqsave(&bd->lock, flags);
list_move_tail(&bc->list, &bd->done_list);
bd->done_cmds++;
spin_unlock_irqrestore(&bd->lock, flags);
wake_up(&bd->wq_done);
}
/*
* do final setup of a 'bc' and submit the matching 'rq' to the block
* layer for io
*/
static void bsg_add_command(struct bsg_device *bd, struct request_queue *q,
struct bsg_command *bc, struct request *rq)
{
int at_head = (0 == (bc->hdr.flags & BSG_FLAG_Q_AT_TAIL));
/*
* add bc command to busy queue and submit rq for io
*/
bc->rq = rq;
bc->bio = rq->bio;
if (rq->next_rq)
bc->bidi_bio = rq->next_rq->bio;
bc->hdr.duration = jiffies;
spin_lock_irq(&bd->lock);
list_add_tail(&bc->list, &bd->busy_list);
spin_unlock_irq(&bd->lock);
dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc);
rq->end_io_data = bc;
blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io);
}
static struct bsg_command *bsg_next_done_cmd(struct bsg_device *bd)
{
struct bsg_command *bc = NULL;
spin_lock_irq(&bd->lock);
if (bd->done_cmds) {
bc = list_first_entry(&bd->done_list, struct bsg_command, list);
list_del(&bc->list);
bd->done_cmds--;
}
spin_unlock_irq(&bd->lock);
return bc;
}
/*
* Get a finished command from the done list
*/
static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
{
struct bsg_command *bc;
int ret;
do {
bc = bsg_next_done_cmd(bd);
if (bc)
break;
if (!test_bit(BSG_F_BLOCK, &bd->flags)) {
bc = ERR_PTR(-EAGAIN);
break;
}
ret = wait_event_interruptible(bd->wq_done, bd->done_cmds);
if (ret) {
bc = ERR_PTR(-ERESTARTSYS);
break;
}
} while (1);
dprintk("%s: returning done %p\n", bd->name, bc);
return bc;
}
static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
struct bio *bio, struct bio *bidi_bio)
{
int ret = 0;
dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors);
/*
* fill in all the output members
*/
hdr->device_status = rq->errors & 0xff;
hdr->transport_status = host_byte(rq->errors);
hdr->driver_status = driver_byte(rq->errors);
hdr->info = 0;
if (hdr->device_status || hdr->transport_status || hdr->driver_status)
hdr->info |= SG_INFO_CHECK;
hdr->response_len = 0;
if (rq->sense_len && hdr->response) {
int len = min_t(unsigned int, hdr->max_response_len,
rq->sense_len);
ret = copy_to_user((void*)(unsigned long)hdr->response,
rq->sense, len);
if (!ret)
hdr->response_len = len;
else
ret = -EFAULT;
}
if (rq->next_rq) {
block: add rq->resid_len rq->data_len served two purposes - the length of data buffer on issue and the residual count on completion. This duality creates some headaches. First of all, block layer and low level drivers can't really determine what rq->data_len contains while a request is executing. It could be the total request length or it coulde be anything else one of the lower layers is using to keep track of residual count. This complicates things because blk_rq_bytes() and thus [__]blk_end_request_all() relies on rq->data_len for PC commands. Drivers which want to report residual count should first cache the total request length, update rq->data_len and then complete the request with the cached data length. Secondly, it makes requests default to reporting full residual count, ie. reporting that no data transfer occurred. The residual count is an exception not the norm; however, the driver should clear rq->data_len to zero to signify the normal cases while leaving it alone means no data transfer occurred at all. This reverse default behavior complicates code unnecessarily and renders block PC on some drivers (ide-tape/floppy) unuseable. This patch adds rq->resid_len which is used only for residual count. While at it, remove now unnecessasry blk_rq_bytes() caching in ide_pc_intr() as rq->data_len is not changed anymore. Boaz : spotted missing conversion in osd Sergei : spotted too early conversion to blk_rq_bytes() in ide-tape [ Impact: cleanup residual count handling, report 0 resid by default ] Signed-off-by: Tejun Heo <tj@kernel.org> Cc: James Bottomley <James.Bottomley@HansenPartnership.com> Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com> Cc: Borislav Petkov <petkovbb@googlemail.com> Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com> Cc: Mike Miller <mike.miller@hp.com> Cc: Eric Moore <Eric.Moore@lsi.com> Cc: Alan Stern <stern@rowland.harvard.edu> Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> Cc: Doug Gilbert <dgilbert@interlog.com> Cc: Mike Miller <mike.miller@hp.com> Cc: Eric Moore <Eric.Moore@lsi.com> Cc: Darrick J. Wong <djwong@us.ibm.com> Cc: Pete Zaitcev <zaitcev@redhat.com> Cc: Boaz Harrosh <bharrosh@panasas.com> Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2009-05-07 21:24:37 +08:00
hdr->dout_resid = rq->resid_len;
hdr->din_resid = rq->next_rq->resid_len;
blk_rq_unmap_user(bidi_bio);
blk_put_request(rq->next_rq);
} else if (rq_data_dir(rq) == READ)
block: add rq->resid_len rq->data_len served two purposes - the length of data buffer on issue and the residual count on completion. This duality creates some headaches. First of all, block layer and low level drivers can't really determine what rq->data_len contains while a request is executing. It could be the total request length or it coulde be anything else one of the lower layers is using to keep track of residual count. This complicates things because blk_rq_bytes() and thus [__]blk_end_request_all() relies on rq->data_len for PC commands. Drivers which want to report residual count should first cache the total request length, update rq->data_len and then complete the request with the cached data length. Secondly, it makes requests default to reporting full residual count, ie. reporting that no data transfer occurred. The residual count is an exception not the norm; however, the driver should clear rq->data_len to zero to signify the normal cases while leaving it alone means no data transfer occurred at all. This reverse default behavior complicates code unnecessarily and renders block PC on some drivers (ide-tape/floppy) unuseable. This patch adds rq->resid_len which is used only for residual count. While at it, remove now unnecessasry blk_rq_bytes() caching in ide_pc_intr() as rq->data_len is not changed anymore. Boaz : spotted missing conversion in osd Sergei : spotted too early conversion to blk_rq_bytes() in ide-tape [ Impact: cleanup residual count handling, report 0 resid by default ] Signed-off-by: Tejun Heo <tj@kernel.org> Cc: James Bottomley <James.Bottomley@HansenPartnership.com> Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com> Cc: Borislav Petkov <petkovbb@googlemail.com> Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com> Cc: Mike Miller <mike.miller@hp.com> Cc: Eric Moore <Eric.Moore@lsi.com> Cc: Alan Stern <stern@rowland.harvard.edu> Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> Cc: Doug Gilbert <dgilbert@interlog.com> Cc: Mike Miller <mike.miller@hp.com> Cc: Eric Moore <Eric.Moore@lsi.com> Cc: Darrick J. Wong <djwong@us.ibm.com> Cc: Pete Zaitcev <zaitcev@redhat.com> Cc: Boaz Harrosh <bharrosh@panasas.com> Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2009-05-07 21:24:37 +08:00
hdr->din_resid = rq->resid_len;
else
block: add rq->resid_len rq->data_len served two purposes - the length of data buffer on issue and the residual count on completion. This duality creates some headaches. First of all, block layer and low level drivers can't really determine what rq->data_len contains while a request is executing. It could be the total request length or it coulde be anything else one of the lower layers is using to keep track of residual count. This complicates things because blk_rq_bytes() and thus [__]blk_end_request_all() relies on rq->data_len for PC commands. Drivers which want to report residual count should first cache the total request length, update rq->data_len and then complete the request with the cached data length. Secondly, it makes requests default to reporting full residual count, ie. reporting that no data transfer occurred. The residual count is an exception not the norm; however, the driver should clear rq->data_len to zero to signify the normal cases while leaving it alone means no data transfer occurred at all. This reverse default behavior complicates code unnecessarily and renders block PC on some drivers (ide-tape/floppy) unuseable. This patch adds rq->resid_len which is used only for residual count. While at it, remove now unnecessasry blk_rq_bytes() caching in ide_pc_intr() as rq->data_len is not changed anymore. Boaz : spotted missing conversion in osd Sergei : spotted too early conversion to blk_rq_bytes() in ide-tape [ Impact: cleanup residual count handling, report 0 resid by default ] Signed-off-by: Tejun Heo <tj@kernel.org> Cc: James Bottomley <James.Bottomley@HansenPartnership.com> Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com> Cc: Borislav Petkov <petkovbb@googlemail.com> Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com> Cc: Mike Miller <mike.miller@hp.com> Cc: Eric Moore <Eric.Moore@lsi.com> Cc: Alan Stern <stern@rowland.harvard.edu> Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> Cc: Doug Gilbert <dgilbert@interlog.com> Cc: Mike Miller <mike.miller@hp.com> Cc: Eric Moore <Eric.Moore@lsi.com> Cc: Darrick J. Wong <djwong@us.ibm.com> Cc: Pete Zaitcev <zaitcev@redhat.com> Cc: Boaz Harrosh <bharrosh@panasas.com> Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2009-05-07 21:24:37 +08:00
hdr->dout_resid = rq->resid_len;
/*
* If the request generated a negative error number, return it
* (providing we aren't already returning an error); if it's
* just a protocol response (i.e. non negative), that gets
* processed above.
*/
if (!ret && rq->errors < 0)
ret = rq->errors;
blk_rq_unmap_user(bio);
if (rq->cmd != rq->__cmd)
kfree(rq->cmd);
blk_put_request(rq);
return ret;
}
static int bsg_complete_all_commands(struct bsg_device *bd)
{
struct bsg_command *bc;
int ret, tret;
dprintk("%s: entered\n", bd->name);
/*
* wait for all commands to complete
*/
ret = 0;
do {
ret = bsg_io_schedule(bd);
/*
* look for -ENODATA specifically -- we'll sometimes get
* -ERESTARTSYS when we've taken a signal, but we can't
* return until we're done freeing the queue, so ignore
* it. The signal will get handled when we're done freeing
* the bsg_device.
*/
} while (ret != -ENODATA);
/*
* discard done commands
*/
ret = 0;
do {
spin_lock_irq(&bd->lock);
if (!bd->queued_cmds) {
spin_unlock_irq(&bd->lock);
break;
}
spin_unlock_irq(&bd->lock);
bc = bsg_get_done_cmd(bd);
if (IS_ERR(bc))
break;
tret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio,
bc->bidi_bio);
if (!ret)
ret = tret;
bsg_free_command(bc);
} while (1);
return ret;
}
static int
__bsg_read(char __user *buf, size_t count, struct bsg_device *bd,
const struct iovec *iov, ssize_t *bytes_read)
{
struct bsg_command *bc;
int nr_commands, ret;
if (count % sizeof(struct sg_io_v4))
return -EINVAL;
ret = 0;
nr_commands = count / sizeof(struct sg_io_v4);
while (nr_commands) {
bc = bsg_get_done_cmd(bd);
if (IS_ERR(bc)) {
ret = PTR_ERR(bc);
break;
}
/*
* this is the only case where we need to copy data back
* after completing the request. so do that here,
* bsg_complete_work() cannot do that for us
*/
ret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio,
bc->bidi_bio);
if (copy_to_user(buf, &bc->hdr, sizeof(bc->hdr)))
ret = -EFAULT;
bsg_free_command(bc);
if (ret)
break;
buf += sizeof(struct sg_io_v4);
*bytes_read += sizeof(struct sg_io_v4);
nr_commands--;
}
return ret;
}
static inline void bsg_set_block(struct bsg_device *bd, struct file *file)
{
if (file->f_flags & O_NONBLOCK)
clear_bit(BSG_F_BLOCK, &bd->flags);
else
set_bit(BSG_F_BLOCK, &bd->flags);
}
/*
* Check if the error is a "real" error that we should return.
*/
static inline int err_block_err(int ret)
{
if (ret && ret != -ENOSPC && ret != -ENODATA && ret != -EAGAIN)
return 1;
return 0;
}
static ssize_t
bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
struct bsg_device *bd = file->private_data;
int ret;
ssize_t bytes_read;
dprintk("%s: read %Zd bytes\n", bd->name, count);
bsg_set_block(bd, file);
bytes_read = 0;
ret = __bsg_read(buf, count, bd, NULL, &bytes_read);
*ppos = bytes_read;
if (!bytes_read || (bytes_read && err_block_err(ret)))
bytes_read = ret;
return bytes_read;
}
static int __bsg_write(struct bsg_device *bd, const char __user *buf,
size_t count, ssize_t *bytes_written,
fmode_t has_write_perm)
{
struct bsg_command *bc;
struct request *rq;
int ret, nr_commands;
if (count % sizeof(struct sg_io_v4))
return -EINVAL;
nr_commands = count / sizeof(struct sg_io_v4);
rq = NULL;
bc = NULL;
ret = 0;
while (nr_commands) {
struct request_queue *q = bd->queue;
bc = bsg_alloc_command(bd);
if (IS_ERR(bc)) {
ret = PTR_ERR(bc);
bc = NULL;
break;
}
if (copy_from_user(&bc->hdr, buf, sizeof(bc->hdr))) {
ret = -EFAULT;
break;
}
/*
* get a request, fill in the blanks, and add to request queue
*/
rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm, bc->sense);
if (IS_ERR(rq)) {
ret = PTR_ERR(rq);
rq = NULL;
break;
}
bsg_add_command(bd, q, bc, rq);
bc = NULL;
rq = NULL;
nr_commands--;
buf += sizeof(struct sg_io_v4);
*bytes_written += sizeof(struct sg_io_v4);
}
if (bc)
bsg_free_command(bc);
return ret;
}
static ssize_t
bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
struct bsg_device *bd = file->private_data;
ssize_t bytes_written;
int ret;
dprintk("%s: write %Zd bytes\n", bd->name, count);
bsg_set_block(bd, file);
bytes_written = 0;
ret = __bsg_write(bd, buf, count, &bytes_written,
file->f_mode & FMODE_WRITE);
*ppos = bytes_written;
/*
* return bytes written on non-fatal errors
*/
if (!bytes_written || (bytes_written && err_block_err(ret)))
bytes_written = ret;
dprintk("%s: returning %Zd\n", bd->name, bytes_written);
return bytes_written;
}
static struct bsg_device *bsg_alloc_device(void)
{
struct bsg_device *bd;
bd = kzalloc(sizeof(struct bsg_device), GFP_KERNEL);
if (unlikely(!bd))
return NULL;
spin_lock_init(&bd->lock);
bd->max_queue = BSG_DEFAULT_CMDS;
INIT_LIST_HEAD(&bd->busy_list);
INIT_LIST_HEAD(&bd->done_list);
INIT_HLIST_NODE(&bd->dev_list);
init_waitqueue_head(&bd->wq_free);
init_waitqueue_head(&bd->wq_done);
return bd;
}
static void bsg_kref_release_function(struct kref *kref)
{
struct bsg_class_device *bcd =
container_of(kref, struct bsg_class_device, ref);
struct device *parent = bcd->parent;
if (bcd->release)
bcd->release(bcd->parent);
put_device(parent);
}
static int bsg_put_device(struct bsg_device *bd)
{
int ret = 0, do_free;
struct request_queue *q = bd->queue;
mutex_lock(&bsg_mutex);
do_free = atomic_dec_and_test(&bd->ref_count);
if (!do_free) {
mutex_unlock(&bsg_mutex);
goto out;
}
hlist_del(&bd->dev_list);
mutex_unlock(&bsg_mutex);
dprintk("%s: tearing down\n", bd->name);
/*
* close can always block
*/
set_bit(BSG_F_BLOCK, &bd->flags);
/*
* correct error detection baddies here again. it's the responsibility
* of the app to properly reap commands before close() if it wants
* fool-proof error detection
*/
ret = bsg_complete_all_commands(bd);
kfree(bd);
out:
kref_put(&q->bsg_dev.ref, bsg_kref_release_function);
if (do_free)
blk_put_queue(q);
return ret;
}
static struct bsg_device *bsg_add_device(struct inode *inode,
struct request_queue *rq,
struct file *file)
{
struct bsg_device *bd;
int ret;
#ifdef BSG_DEBUG
unsigned char buf[32];
#endif
ret = blk_get_queue(rq);
if (ret)
return ERR_PTR(-ENXIO);
bd = bsg_alloc_device();
if (!bd) {
blk_put_queue(rq);
return ERR_PTR(-ENOMEM);
}
bd->queue = rq;
bsg_set_block(bd, file);
atomic_set(&bd->ref_count, 1);
mutex_lock(&bsg_mutex);
hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1);
dprintk("bound to <%s>, max queue %d\n",
format_dev_t(buf, inode->i_rdev), bd->max_queue);
mutex_unlock(&bsg_mutex);
return bd;
}
static struct bsg_device *__bsg_get_device(int minor, struct request_queue *q)
{
struct bsg_device *bd;
struct hlist_node *entry;
mutex_lock(&bsg_mutex);
hlist_for_each_entry(bd, entry, bsg_dev_idx_hash(minor), dev_list) {
if (bd->queue == q) {
atomic_inc(&bd->ref_count);
goto found;
}
}
bd = NULL;
found:
mutex_unlock(&bsg_mutex);
return bd;
}
static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file)
{
struct bsg_device *bd;
struct bsg_class_device *bcd;
/*
* find the class device
*/
mutex_lock(&bsg_mutex);
bcd = idr_find(&bsg_minor_idr, iminor(inode));
if (bcd)
kref_get(&bcd->ref);
mutex_unlock(&bsg_mutex);
if (!bcd)
return ERR_PTR(-ENODEV);
bd = __bsg_get_device(iminor(inode), bcd->queue);
if (bd)
return bd;
bd = bsg_add_device(inode, bcd->queue, file);
if (IS_ERR(bd))
kref_put(&bcd->ref, bsg_kref_release_function);
return bd;
}
static int bsg_open(struct inode *inode, struct file *file)
{
struct bsg_device *bd;
bd = bsg_get_device(inode, file);
if (IS_ERR(bd))
return PTR_ERR(bd);
file->private_data = bd;
return 0;
}
static int bsg_release(struct inode *inode, struct file *file)
{
struct bsg_device *bd = file->private_data;
file->private_data = NULL;
return bsg_put_device(bd);
}
static unsigned int bsg_poll(struct file *file, poll_table *wait)
{
struct bsg_device *bd = file->private_data;
unsigned int mask = 0;
poll_wait(file, &bd->wq_done, wait);
poll_wait(file, &bd->wq_free, wait);
spin_lock_irq(&bd->lock);
if (!list_empty(&bd->done_list))
mask |= POLLIN | POLLRDNORM;
if (bd->queued_cmds >= bd->max_queue)
mask |= POLLOUT;
spin_unlock_irq(&bd->lock);
return mask;
}
static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct bsg_device *bd = file->private_data;
int __user *uarg = (int __user *) arg;
int ret;
switch (cmd) {
/*
* our own ioctls
*/
case SG_GET_COMMAND_Q:
return put_user(bd->max_queue, uarg);
case SG_SET_COMMAND_Q: {
int queue;
if (get_user(queue, uarg))
return -EFAULT;
if (queue < 1)
return -EINVAL;
spin_lock_irq(&bd->lock);
bd->max_queue = queue;
spin_unlock_irq(&bd->lock);
return 0;
}
/*
* SCSI/sg ioctls
*/
case SG_GET_VERSION_NUM:
case SCSI_IOCTL_GET_IDLUN:
case SCSI_IOCTL_GET_BUS_NUMBER:
case SG_SET_TIMEOUT:
case SG_GET_TIMEOUT:
case SG_GET_RESERVED_SIZE:
case SG_SET_RESERVED_SIZE:
case SG_EMULATED_HOST:
case SCSI_IOCTL_SEND_COMMAND: {
void __user *uarg = (void __user *) arg;
return scsi_cmd_ioctl(bd->queue, NULL, file->f_mode, cmd, uarg);
}
case SG_IO: {
struct request *rq;
struct bio *bio, *bidi_bio = NULL;
struct sg_io_v4 hdr;
int at_head;
u8 sense[SCSI_SENSE_BUFFERSIZE];
if (copy_from_user(&hdr, uarg, sizeof(hdr)))
return -EFAULT;
rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE, sense);
if (IS_ERR(rq))
return PTR_ERR(rq);
bio = rq->bio;
if (rq->next_rq)
bidi_bio = rq->next_rq->bio;
at_head = (0 == (hdr.flags & BSG_FLAG_Q_AT_TAIL));
blk_execute_rq(bd->queue, NULL, rq, at_head);
ret = blk_complete_sgv4_hdr_rq(rq, &hdr, bio, bidi_bio);
if (copy_to_user(uarg, &hdr, sizeof(hdr)))
return -EFAULT;
return ret;
}
/*
* block device ioctls
*/
default:
#if 0
return ioctl_by_bdev(bd->bdev, cmd, arg);
#else
return -ENOTTY;
#endif
}
}
static const struct file_operations bsg_fops = {
.read = bsg_read,
.write = bsg_write,
.poll = bsg_poll,
.open = bsg_open,
.release = bsg_release,
.unlocked_ioctl = bsg_ioctl,
.owner = THIS_MODULE,
llseek: automatically add .llseek fop All file_operations should get a .llseek operation so we can make nonseekable_open the default for future file operations without a .llseek pointer. The three cases that we can automatically detect are no_llseek, seq_lseek and default_llseek. For cases where we can we can automatically prove that the file offset is always ignored, we use noop_llseek, which maintains the current behavior of not returning an error from a seek. New drivers should normally not use noop_llseek but instead use no_llseek and call nonseekable_open at open time. Existing drivers can be converted to do the same when the maintainer knows for certain that no user code relies on calling seek on the device file. The generated code is often incorrectly indented and right now contains comments that clarify for each added line why a specific variant was chosen. In the version that gets submitted upstream, the comments will be gone and I will manually fix the indentation, because there does not seem to be a way to do that using coccinelle. Some amount of new code is currently sitting in linux-next that should get the same modifications, which I will do at the end of the merge window. Many thanks to Julia Lawall for helping me learn to write a semantic patch that does all this. ===== begin semantic patch ===== // This adds an llseek= method to all file operations, // as a preparation for making no_llseek the default. // // The rules are // - use no_llseek explicitly if we do nonseekable_open // - use seq_lseek for sequential files // - use default_llseek if we know we access f_pos // - use noop_llseek if we know we don't access f_pos, // but we still want to allow users to call lseek // @ open1 exists @ identifier nested_open; @@ nested_open(...) { <+... nonseekable_open(...) ...+> } @ open exists@ identifier open_f; identifier i, f; identifier open1.nested_open; @@ int open_f(struct inode *i, struct file *f) { <+... ( nonseekable_open(...) | nested_open(...) ) ...+> } @ read disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ read_no_fpos disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { ... when != off } @ write @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ write_no_fpos @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { ... when != off } @ fops0 @ identifier fops; @@ struct file_operations fops = { ... }; @ has_llseek depends on fops0 @ identifier fops0.fops; identifier llseek_f; @@ struct file_operations fops = { ... .llseek = llseek_f, ... }; @ has_read depends on fops0 @ identifier fops0.fops; identifier read_f; @@ struct file_operations fops = { ... .read = read_f, ... }; @ has_write depends on fops0 @ identifier fops0.fops; identifier write_f; @@ struct file_operations fops = { ... .write = write_f, ... }; @ has_open depends on fops0 @ identifier fops0.fops; identifier open_f; @@ struct file_operations fops = { ... .open = open_f, ... }; // use no_llseek if we call nonseekable_open //////////////////////////////////////////// @ nonseekable1 depends on !has_llseek && has_open @ identifier fops0.fops; identifier nso ~= "nonseekable_open"; @@ struct file_operations fops = { ... .open = nso, ... +.llseek = no_llseek, /* nonseekable */ }; @ nonseekable2 depends on !has_llseek @ identifier fops0.fops; identifier open.open_f; @@ struct file_operations fops = { ... .open = open_f, ... +.llseek = no_llseek, /* open uses nonseekable */ }; // use seq_lseek for sequential files ///////////////////////////////////// @ seq depends on !has_llseek @ identifier fops0.fops; identifier sr ~= "seq_read"; @@ struct file_operations fops = { ... .read = sr, ... +.llseek = seq_lseek, /* we have seq_read */ }; // use default_llseek if there is a readdir /////////////////////////////////////////// @ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier readdir_e; @@ // any other fop is used that changes pos struct file_operations fops = { ... .readdir = readdir_e, ... +.llseek = default_llseek, /* readdir is present */ }; // use default_llseek if at least one of read/write touches f_pos ///////////////////////////////////////////////////////////////// @ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read.read_f; @@ // read fops use offset struct file_operations fops = { ... .read = read_f, ... +.llseek = default_llseek, /* read accesses f_pos */ }; @ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, ... + .llseek = default_llseek, /* write accesses f_pos */ }; // Use noop_llseek if neither read nor write accesses f_pos /////////////////////////////////////////////////////////// @ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; identifier write_no_fpos.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, .read = read_f, ... +.llseek = noop_llseek, /* read and write both use no f_pos */ }; @ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write_no_fpos.write_f; @@ struct file_operations fops = { ... .write = write_f, ... +.llseek = noop_llseek, /* write uses no f_pos */ }; @ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; @@ struct file_operations fops = { ... .read = read_f, ... +.llseek = noop_llseek, /* read uses no f_pos */ }; @ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; @@ struct file_operations fops = { ... +.llseek = noop_llseek, /* no read or write fn */ }; ===== End semantic patch ===== Signed-off-by: Arnd Bergmann <arnd@arndb.de> Cc: Julia Lawall <julia@diku.dk> Cc: Christoph Hellwig <hch@infradead.org>
2010-08-16 00:52:59 +08:00
.llseek = default_llseek,
};
void bsg_unregister_queue(struct request_queue *q)
{
struct bsg_class_device *bcd = &q->bsg_dev;
if (!bcd->class_dev)
return;
mutex_lock(&bsg_mutex);
idr_remove(&bsg_minor_idr, bcd->minor);
sysfs_remove_link(&q->kobj, "bsg");
device_unregister(bcd->class_dev);
bcd->class_dev = NULL;
kref_put(&bcd->ref, bsg_kref_release_function);
mutex_unlock(&bsg_mutex);
}
EXPORT_SYMBOL_GPL(bsg_unregister_queue);
int bsg_register_queue(struct request_queue *q, struct device *parent,
const char *name, void (*release)(struct device *))
{
struct bsg_class_device *bcd;
dev_t dev;
int ret, minor;
struct device *class_dev = NULL;
const char *devname;
if (name)
devname = name;
else
devname = dev_name(parent);
/*
* we need a proper transport to send commands, not a stacked device
*/
if (!q->request_fn)
return 0;
bcd = &q->bsg_dev;
memset(bcd, 0, sizeof(*bcd));
mutex_lock(&bsg_mutex);
ret = idr_pre_get(&bsg_minor_idr, GFP_KERNEL);
if (!ret) {
ret = -ENOMEM;
goto unlock;
}
ret = idr_get_new(&bsg_minor_idr, bcd, &minor);
if (ret < 0)
goto unlock;
if (minor >= BSG_MAX_DEVS) {
printk(KERN_ERR "bsg: too many bsg devices\n");
ret = -EINVAL;
goto remove_idr;
}
bcd->minor = minor;
bcd->queue = q;
bcd->parent = get_device(parent);
bcd->release = release;
kref_init(&bcd->ref);
dev = MKDEV(bsg_major, bcd->minor);
class_dev = device_create(bsg_class, parent, dev, NULL, "%s", devname);
if (IS_ERR(class_dev)) {
ret = PTR_ERR(class_dev);
goto put_dev;
}
bcd->class_dev = class_dev;
if (q->kobj.sd) {
ret = sysfs_create_link(&q->kobj, &bcd->class_dev->kobj, "bsg");
if (ret)
goto unregister_class_dev;
}
mutex_unlock(&bsg_mutex);
return 0;
unregister_class_dev:
device_unregister(class_dev);
put_dev:
put_device(parent);
remove_idr:
idr_remove(&bsg_minor_idr, minor);
unlock:
mutex_unlock(&bsg_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(bsg_register_queue);
static struct cdev bsg_cdev;
static char *bsg_devnode(struct device *dev, mode_t *mode)
{
return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev));
}
static int __init bsg_init(void)
{
int ret, i;
dev_t devid;
bsg_cmd_cachep = kmem_cache_create("bsg_cmd",
sizeof(struct bsg_command), 0, 0, NULL);
if (!bsg_cmd_cachep) {
printk(KERN_ERR "bsg: failed creating slab cache\n");
return -ENOMEM;
}
for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++)
INIT_HLIST_HEAD(&bsg_device_list[i]);
bsg_class = class_create(THIS_MODULE, "bsg");
if (IS_ERR(bsg_class)) {
ret = PTR_ERR(bsg_class);
goto destroy_kmemcache;
}
bsg_class->devnode = bsg_devnode;
ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg");
if (ret)
goto destroy_bsg_class;
bsg_major = MAJOR(devid);
cdev_init(&bsg_cdev, &bsg_fops);
ret = cdev_add(&bsg_cdev, MKDEV(bsg_major, 0), BSG_MAX_DEVS);
if (ret)
goto unregister_chrdev;
printk(KERN_INFO BSG_DESCRIPTION " version " BSG_VERSION
" loaded (major %d)\n", bsg_major);
return 0;
unregister_chrdev:
unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS);
destroy_bsg_class:
class_destroy(bsg_class);
destroy_kmemcache:
kmem_cache_destroy(bsg_cmd_cachep);
return ret;
}
MODULE_AUTHOR("Jens Axboe");
MODULE_DESCRIPTION(BSG_DESCRIPTION);
MODULE_LICENSE("GPL");
device_initcall(bsg_init);