From ec924b4726e3df000d3ac7ae10cb8ef1adcd60ca Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 17 Jul 2006 18:20:51 +0300 Subject: [PATCH 01/67] IB/uverbs: Fix unlocking in error paths ib_uverbs_create_ah() and ib_uverbs_create_srq() did not release the PD's read lock in their error paths, which lead to deadlock when destroying the PD. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index bdf5d5098190..0371806cf398 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1775,7 +1775,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, ah = ib_create_ah(pd, &attr); if (IS_ERR(ah)) { ret = PTR_ERR(ah); - goto err; + goto err_put; } ah->uobject = uobj; @@ -1811,6 +1811,9 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, err_destroy: ib_destroy_ah(ah); +err_put: + put_pd_read(pd); + err: put_uobj_write(uobj); return ret; @@ -1984,7 +1987,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, srq = pd->device->create_srq(pd, &attr, &udata); if (IS_ERR(srq)) { ret = PTR_ERR(srq); - goto err; + goto err_put; } srq->device = pd->device; @@ -2029,6 +2032,9 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, err_destroy: ib_destroy_srq(srq); +err_put: + put_pd_read(pd); + err: put_uobj_write(&obj->uobject); return ret; From 43db2bc04409b1e1b74f9768e3284cec18a87d0b Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sun, 23 Jul 2006 15:16:04 -0700 Subject: [PATCH 02/67] IB/uverbs: Fix lockdep warnings Lockdep warns because uverbs is trying to take uobj->mutex when it already holds that lock. This is because there are really multiple types of uobjs even though all of their locks are initialized in common code. Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 32 ++++++++++++++++++---------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 0371806cf398..30923eb68ec7 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -42,6 +42,13 @@ #include "uverbs.h" +static struct lock_class_key pd_lock_key; +static struct lock_class_key mr_lock_key; +static struct lock_class_key cq_lock_key; +static struct lock_class_key qp_lock_key; +static struct lock_class_key ah_lock_key; +static struct lock_class_key srq_lock_key; + #define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ do { \ (udata)->inbuf = (void __user *) (ibuf); \ @@ -76,12 +83,13 @@ */ static void init_uobj(struct ib_uobject *uobj, u64 user_handle, - struct ib_ucontext *context) + struct ib_ucontext *context, struct lock_class_key *key) { uobj->user_handle = user_handle; uobj->context = context; kref_init(&uobj->ref); init_rwsem(&uobj->mutex); + lockdep_set_class(&uobj->mutex, key); uobj->live = 0; } @@ -470,7 +478,7 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, if (!uobj) return -ENOMEM; - init_uobj(uobj, 0, file->ucontext); + init_uobj(uobj, 0, file->ucontext, &pd_lock_key); down_write(&uobj->mutex); pd = file->device->ib_dev->alloc_pd(file->device->ib_dev, @@ -591,7 +599,7 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if (!obj) return -ENOMEM; - init_uobj(&obj->uobject, 0, file->ucontext); + init_uobj(&obj->uobject, 0, file->ucontext, &mr_lock_key); down_write(&obj->uobject.mutex); /* @@ -770,7 +778,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, if (!obj) return -ENOMEM; - init_uobj(&obj->uobject, cmd.user_handle, file->ucontext); + init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &cq_lock_key); down_write(&obj->uobject.mutex); if (cmd.comp_channel >= 0) { @@ -1051,13 +1059,14 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, if (!obj) return -ENOMEM; - init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext); + init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_key); down_write(&obj->uevent.uobject.mutex); + srq = cmd.is_srq ? idr_read_srq(cmd.srq_handle, file->ucontext) : NULL; pd = idr_read_pd(cmd.pd_handle, file->ucontext); scq = idr_read_cq(cmd.send_cq_handle, file->ucontext); - rcq = idr_read_cq(cmd.recv_cq_handle, file->ucontext); - srq = cmd.is_srq ? idr_read_srq(cmd.srq_handle, file->ucontext) : NULL; + rcq = cmd.recv_cq_handle == cmd.send_cq_handle ? + scq : idr_read_cq(cmd.recv_cq_handle, file->ucontext); if (!pd || !scq || !rcq || (cmd.is_srq && !srq)) { ret = -EINVAL; @@ -1125,7 +1134,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, put_pd_read(pd); put_cq_read(scq); - put_cq_read(rcq); + if (rcq != scq) + put_cq_read(rcq); if (srq) put_srq_read(srq); @@ -1150,7 +1160,7 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, put_pd_read(pd); if (scq) put_cq_read(scq); - if (rcq) + if (rcq && rcq != scq) put_cq_read(rcq); if (srq) put_srq_read(srq); @@ -1751,7 +1761,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, if (!uobj) return -ENOMEM; - init_uobj(uobj, cmd.user_handle, file->ucontext); + init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_key); down_write(&uobj->mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); @@ -1966,7 +1976,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, if (!obj) return -ENOMEM; - init_uobj(&obj->uobject, cmd.user_handle, file->ucontext); + init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &srq_lock_key); down_write(&obj->uobject.mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); From 1252c517cf3df240ae51946a096035765dfd2e6d Mon Sep 17 00:00:00 2001 From: Dotan Barak Date: Thu, 13 Jul 2006 11:05:49 +0300 Subject: [PATCH 03/67] IB/mthca: Fix SRQ limit event range check Mem-free HCAs always keep one spare SRQ WQE, so the SRQ limit cannot be set beyond srq->max - 1. Signed-off-by: Dotan Barak Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_srq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index fab417c5cf43..b60a9d79ae54 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -370,7 +370,8 @@ int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, return -EINVAL; if (attr_mask & IB_SRQ_LIMIT) { - if (attr->srq_limit > srq->max) + u32 max_wr = mthca_is_memfree(dev) ? srq->max - 1 : srq->max; + if (attr->srq_limit > max_wr) return -EINVAL; mutex_lock(&srq->mutex); From 3d37b9e209136cf178562bbedc7cd2ecb1da8beb Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 17 Jul 2006 18:18:36 -0700 Subject: [PATCH 04/67] IB/ipath: Fix a data corruption This patch fixes a problem where certain error packets are passed to the InfiniBand layer for processing even though the packet actually was received with an error. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_driver.c | 76 ++++++++++------------ 1 file changed, 36 insertions(+), 40 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 823131d58b34..f98518d912b5 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -859,6 +859,38 @@ static void ipath_rcv_layer(struct ipath_devdata *dd, u32 etail, __ipath_layer_rcv_lid(dd, hdr); } +static void ipath_rcv_hdrerr(struct ipath_devdata *dd, + u32 eflags, + u32 l, + u32 etail, + u64 *rc) +{ + char emsg[128]; + struct ipath_message_header *hdr; + + get_rhf_errstring(eflags, emsg, sizeof emsg); + hdr = (struct ipath_message_header *)&rc[1]; + ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u " + "tlen=%x opcode=%x egridx=%x: %s\n", + eflags, l, + ipath_hdrget_rcv_type((__le32 *) rc), + ipath_hdrget_length_in_bytes((__le32 *) rc), + be32_to_cpu(hdr->bth[0]) >> 24, + etail, emsg); + + /* Count local link integrity errors. */ + if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) { + u8 n = (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; + + if (++dd->ipath_lli_counter > n) { + dd->ipath_lli_counter = 0; + dd->ipath_lli_errors++; + } + } +} + /* * ipath_kreceive - receive a packet * @dd: the infinipath device @@ -875,7 +907,6 @@ void ipath_kreceive(struct ipath_devdata *dd) struct ipath_message_header *hdr; u32 eflags, i, etype, tlen, pkttot = 0, updegr=0, reloop=0; static u64 totcalls; /* stats, may eventually remove */ - char emsg[128]; if (!dd->ipath_hdrqtailptr) { ipath_dev_err(dd, @@ -938,26 +969,9 @@ void ipath_kreceive(struct ipath_devdata *dd) "%x\n", etype); } - if (eflags & ~(INFINIPATH_RHF_H_TIDERR | - INFINIPATH_RHF_H_IHDRERR)) { - get_rhf_errstring(eflags, emsg, sizeof emsg); - ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u " - "tlen=%x opcode=%x egridx=%x: %s\n", - eflags, l, etype, tlen, bthbytes[0], - ipath_hdrget_index((__le32 *) rc), emsg); - /* Count local link integrity errors. */ - if (eflags & (INFINIPATH_RHF_H_ICRCERR | - INFINIPATH_RHF_H_VCRCERR)) { - u8 n = (dd->ipath_ibcctrl >> - INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & - INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; - - if (++dd->ipath_lli_counter > n) { - dd->ipath_lli_counter = 0; - dd->ipath_lli_errors++; - } - } - } else if (etype == RCVHQ_RCV_TYPE_NON_KD) { + if (unlikely(eflags)) + ipath_rcv_hdrerr(dd, eflags, l, etail, rc); + else if (etype == RCVHQ_RCV_TYPE_NON_KD) { int ret = __ipath_verbs_rcv(dd, rc + 1, ebuf, tlen); if (ret == -ENODEV) @@ -981,25 +995,7 @@ void ipath_kreceive(struct ipath_devdata *dd) else if (etype == RCVHQ_RCV_TYPE_EXPECTED) ipath_dbg("Bug: Expected TID, opcode %x; ignored\n", be32_to_cpu(hdr->bth[0]) & 0xff); - else if (eflags & (INFINIPATH_RHF_H_TIDERR | - INFINIPATH_RHF_H_IHDRERR)) { - /* - * This is a type 3 packet, only the LRH is in the - * rcvhdrq, the rest of the header is in the eager - * buffer. - */ - u8 opcode; - if (ebuf) { - bthbytes = (u8 *) ebuf; - opcode = *bthbytes; - } - else - opcode = 0; - get_rhf_errstring(eflags, emsg, sizeof emsg); - ipath_dbg("Err %x (%s), opcode %x, egrbuf %x, " - "len %x\n", eflags, emsg, opcode, etail, - tlen); - } else { + else { /* * error packet, type of error unknown. * Probably type 3, but we don't know, so don't From c9f79bdc21da9c8d466b6ba7c8bbd6b8e0110ce2 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 17 Jul 2006 18:19:54 -0700 Subject: [PATCH 05/67] IB/ipath: Fix ib_ipath driver to work with SRP I am still working on a proposal to remove the phys_to_virt() calls in the ib_ipath driver. In the mean time, this patch allows SRP to work by fixing the R_Key check and conversion from IB address to kernel virtual address. It also returns the correct page size for FMRs. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_keys.c | 15 +++++++++++++++ drivers/infiniband/hw/ipath/ipath_verbs.c | 1 + 2 files changed, 16 insertions(+) diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c index 46773c673a1a..a5ca279370aa 100644 --- a/drivers/infiniband/hw/ipath/ipath_keys.c +++ b/drivers/infiniband/hw/ipath/ipath_keys.c @@ -197,6 +197,21 @@ int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss, size_t off; int ret; + /* + * We use RKEY == zero for physical addresses + * (see ipath_get_dma_mr). + */ + if (rkey == 0) { + sge->mr = NULL; + sge->vaddr = phys_to_virt(vaddr); + sge->length = len; + sge->sge_length = len; + ss->sg_list = NULL; + ss->num_sge = 1; + ret = 1; + goto bail; + } + mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))]; if (unlikely(mr == NULL || mr->lkey != rkey)) { ret = 0; diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 56ac336dd1ec..70bce7a8d538 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -627,6 +627,7 @@ static int ipath_query_device(struct ib_device *ibdev, props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | IB_DEVICE_SYS_IMAGE_GUID; + props->page_size_cap = PAGE_SIZE; props->vendor_id = ipath_layer_get_vendorid(dev->dd); props->vendor_part_id = ipath_layer_get_deviceid(dev->dd); props->hw_ver = ipath_layer_get_pcirev(dev->dd); From 16c59419a09f0140a07a1828d6a45656265e07c7 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 17 Jul 2006 18:21:24 -0700 Subject: [PATCH 06/67] IB/ipath: ipath_skip_sge() can break if num_sge > 1 ipath_skip_sge() doesn't exactly duplicate the side effects of ipath_copy_sge() if num_sge > 1 since it doesn't decrement ss->num_sge. This could result in the sg_list being accessed out of bounds. Since ipath_skip_sge() is almost always called with num_sge == 1, the original "optimization" is almost never used. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_verbs.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 70bce7a8d538..d70a9b6b5239 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -191,10 +191,6 @@ void ipath_skip_sge(struct ipath_sge_state *ss, u32 length) { struct ipath_sge *sge = &ss->sge; - while (length > sge->sge_length) { - length -= sge->sge_length; - ss->sge = *ss->sg_list++; - } while (length) { u32 len = sge->length; From 2527e681fd4fd4231c2e04f09d7b04d3cab8eefe Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 20 Jul 2006 11:25:50 +0300 Subject: [PATCH 07/67] IB/mad: Validate MADs for spec compliance Validate MADs sent by userspace clients for spec compliance with C13-18.1.1 (prevent duplicate requests and responses sent on the same port). Without this, RMPP transactions get aborted because of duplicate packets. This patch is similar to that provided by Jack Morgenstein. Signed-off-by: Sean Hefty Signed-off-by: Michael S. Tsirkin Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/core/mad.c | 22 ++++---- drivers/infiniband/core/user_mad.c | 87 ++++++++++++++++++++++++++---- include/rdma/ib_mad.h | 7 +++ 3 files changed, 95 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 5ed4dab52a6f..1c3cfbbe6a97 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -167,6 +167,15 @@ static int is_vendor_method_in_use( return 0; } +int ib_response_mad(struct ib_mad *mad) +{ + return ((mad->mad_hdr.method & IB_MGMT_METHOD_RESP) || + (mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) || + ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_BM) && + (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP))); +} +EXPORT_SYMBOL(ib_response_mad); + /* * ib_register_mad_agent - Register to send/receive MADs */ @@ -570,13 +579,6 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) } EXPORT_SYMBOL(ib_unregister_mad_agent); -static inline int response_mad(struct ib_mad *mad) -{ - /* Trap represses are responses although response bit is reset */ - return ((mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) || - (mad->mad_hdr.method & IB_MGMT_METHOD_RESP)); -} - static void dequeue_mad(struct ib_mad_list_head *mad_list) { struct ib_mad_queue *mad_queue; @@ -723,7 +725,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, switch (ret) { case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY: - if (response_mad(&mad_priv->mad.mad) && + if (ib_response_mad(&mad_priv->mad.mad) && mad_agent_priv->agent.recv_handler) { local->mad_priv = mad_priv; local->recv_mad_agent = mad_agent_priv; @@ -1551,7 +1553,7 @@ find_mad_agent(struct ib_mad_port_private *port_priv, unsigned long flags; spin_lock_irqsave(&port_priv->reg_lock, flags); - if (response_mad(mad)) { + if (ib_response_mad(mad)) { u32 hi_tid; struct ib_mad_agent_private *entry; @@ -1799,7 +1801,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, } /* Complete corresponding request */ - if (response_mad(mad_recv_wc->recv_buf.mad)) { + if (ib_response_mad(mad_recv_wc->recv_buf.mad)) { spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc); if (!mad_send_wr) { diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index afe70a549c2f..1273f8807e84 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -112,8 +112,10 @@ struct ib_umad_device { struct ib_umad_file { struct ib_umad_port *port; struct list_head recv_list; + struct list_head send_list; struct list_head port_list; spinlock_t recv_lock; + spinlock_t send_lock; wait_queue_head_t recv_wait; struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS]; int agents_dead; @@ -177,12 +179,21 @@ static int queue_packet(struct ib_umad_file *file, return ret; } +static void dequeue_send(struct ib_umad_file *file, + struct ib_umad_packet *packet) + { + spin_lock_irq(&file->send_lock); + list_del(&packet->list); + spin_unlock_irq(&file->send_lock); + } + static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *send_wc) { struct ib_umad_file *file = agent->context; struct ib_umad_packet *packet = send_wc->send_buf->context[0]; + dequeue_send(file, packet); ib_destroy_ah(packet->msg->ah); ib_free_send_mad(packet->msg); @@ -370,6 +381,51 @@ static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf) return 0; } +static int same_destination(struct ib_user_mad_hdr *hdr1, + struct ib_user_mad_hdr *hdr2) +{ + if (!hdr1->grh_present && !hdr2->grh_present) + return (hdr1->lid == hdr2->lid); + + if (hdr1->grh_present && hdr2->grh_present) + return !memcmp(hdr1->gid, hdr2->gid, 16); + + return 0; +} + +static int is_duplicate(struct ib_umad_file *file, + struct ib_umad_packet *packet) +{ + struct ib_umad_packet *sent_packet; + struct ib_mad_hdr *sent_hdr, *hdr; + + hdr = (struct ib_mad_hdr *) packet->mad.data; + list_for_each_entry(sent_packet, &file->send_list, list) { + sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data; + + if ((hdr->tid != sent_hdr->tid) || + (hdr->mgmt_class != sent_hdr->mgmt_class)) + continue; + + /* + * No need to be overly clever here. If two new operations have + * the same TID, reject the second as a duplicate. This is more + * restrictive than required by the spec. + */ + if (!ib_response_mad((struct ib_mad *) hdr)) { + if (!ib_response_mad((struct ib_mad *) sent_hdr)) + return 1; + continue; + } else if (!ib_response_mad((struct ib_mad *) sent_hdr)) + continue; + + if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr)) + return 1; + } + + return 0; +} + static ssize_t ib_umad_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { @@ -379,7 +435,6 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, struct ib_ah_attr ah_attr; struct ib_ah *ah; struct ib_rmpp_mad *rmpp_mad; - u8 method; __be64 *tid; int ret, data_len, hdr_len, copy_offset, rmpp_active; @@ -473,28 +528,36 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, } /* - * If userspace is generating a request that will generate a - * response, we need to make sure the high-order part of the - * transaction ID matches the agent being used to send the - * MAD. + * Set the high-order part of the transaction ID to make MADs from + * different agents unique, and allow routing responses back to the + * original requestor. */ - method = ((struct ib_mad_hdr *) packet->msg->mad)->method; - - if (!(method & IB_MGMT_METHOD_RESP) && - method != IB_MGMT_METHOD_TRAP_REPRESS && - method != IB_MGMT_METHOD_SEND) { + if (!ib_response_mad(packet->msg->mad)) { tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid; *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 | (be64_to_cpup(tid) & 0xffffffff)); + rmpp_mad->mad_hdr.tid = *tid; + } + + spin_lock_irq(&file->send_lock); + ret = is_duplicate(file, packet); + if (!ret) + list_add_tail(&packet->list, &file->send_list); + spin_unlock_irq(&file->send_lock); + if (ret) { + ret = -EINVAL; + goto err_msg; } ret = ib_post_send_mad(packet->msg, NULL); if (ret) - goto err_msg; + goto err_send; up_read(&file->port->mutex); return count; +err_send: + dequeue_send(file, packet); err_msg: ib_free_send_mad(packet->msg); err_ah: @@ -657,7 +720,9 @@ static int ib_umad_open(struct inode *inode, struct file *filp) } spin_lock_init(&file->recv_lock); + spin_lock_init(&file->send_lock); INIT_LIST_HEAD(&file->recv_list); + INIT_LIST_HEAD(&file->send_list); init_waitqueue_head(&file->recv_wait); file->port = port; diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 5ff77558013b..585d28e960dd 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -75,6 +75,7 @@ #define IB_MGMT_METHOD_TRAP_REPRESS 0x07 #define IB_MGMT_METHOD_RESP 0x80 +#define IB_BM_ATTR_MOD_RESP cpu_to_be32(1) #define IB_MGMT_MAX_METHODS 128 @@ -246,6 +247,12 @@ struct ib_mad_send_buf { int retries; }; +/** + * ib_response_mad - Returns if the specified MAD has been generated in + * response to a sent request or trap. + */ +int ib_response_mad(struct ib_mad *mad); + /** * ib_get_rmpp_resptime - Returns the RMPP response time. * @rmpp_hdr: An RMPP header. From 624d01f899f6bbd75fd06890f231e1f46555d376 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Mon, 24 Jul 2006 10:42:00 +0300 Subject: [PATCH 08/67] IB/ipoib: Fix oops with ipoib_debug_mcast set Need to set mcast->ah before debug code dereferences it. Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index ab40488182b3..b5e6a7be603d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -264,6 +264,10 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, if (!ah) { ipoib_warn(priv, "ib_address_create failed\n"); } else { + spin_lock_irq(&priv->lock); + mcast->ah = ah; + spin_unlock_irq(&priv->lock); + ipoib_dbg_mcast(priv, "MGID " IPOIB_GID_FMT " AV %p, LID 0x%04x, SL %d\n", IPOIB_GID_ARG(mcast->mcmember.mgid), @@ -271,10 +275,6 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, be16_to_cpu(mcast->mcmember.mlid), mcast->mcmember.sl); } - - spin_lock_irq(&priv->lock); - mcast->ah = ah; - spin_unlock_irq(&priv->lock); } /* actually send any queued packets */ From 8a7f752125a930a83f4d8dfe37fa5a081ab19d31 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 19 Jul 2006 17:44:37 +0300 Subject: [PATCH 09/67] IB/ipoib: Fix packet loss after hardware address update The neighbour ha field may get updated without destroying the neighbour. In this case, the ha field gets out of sync with the address handle stored in ipoib_neigh->ah, with the result that the ah field would point to an incorrect path, resulting in all packets being lost. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 1 + drivers/infiniband/ulp/ipoib/ipoib_main.c | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 3f89f5e19036..474aa214ab57 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -212,6 +212,7 @@ struct ipoib_path { struct ipoib_neigh { struct ipoib_ah *ah; + union ib_gid dgid; struct sk_buff_head queue; struct neighbour *neighbour; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 1c6ea1c682a5..cf71d2a5515c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -404,6 +404,8 @@ static void path_rec_completion(int status, list_for_each_entry(neigh, &path->neigh_list, list) { kref_get(&path->ah->ref); neigh->ah = path->ah; + memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, + sizeof(union ib_gid)); while ((skb = __skb_dequeue(&neigh->queue))) __skb_queue_tail(&skqueue, skb); @@ -510,6 +512,8 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) if (path->ah) { kref_get(&path->ah->ref); neigh->ah = path->ah; + memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, + sizeof(union ib_gid)); ipoib_send(dev, skb, path->ah, be32_to_cpup((__be32 *) skb->dst->neighbour->ha)); @@ -633,6 +637,25 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) neigh = *to_ipoib_neigh(skb->dst->neighbour); if (likely(neigh->ah)) { + if (unlikely(memcmp(&neigh->dgid.raw, + skb->dst->neighbour->ha + 4, + sizeof(union ib_gid)))) { + spin_lock(&priv->lock); + /* + * It's safe to call ipoib_put_ah() inside + * priv->lock here, because we know that + * path->ah will always hold one more reference, + * so ipoib_put_ah() will never do more than + * decrement the ref count. + */ + ipoib_put_ah(neigh->ah); + list_del(&neigh->list); + ipoib_neigh_free(neigh); + spin_unlock(&priv->lock); + ipoib_path_lookup(skb, dev); + goto out; + } + ipoib_send(dev, skb, neigh->ah, be32_to_cpup((__be32 *) skb->dst->neighbour->ha)); goto out; From 8fdf679fdb00f588b65abb9c775c178098a05aeb Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 24 Jul 2006 09:36:50 -0700 Subject: [PATCH 10/67] IB/mthca: Initialize max_cmds before debug code prints it Read the max_cmds value from the response to the QUERY_FW command before printing out the value, so that the real value goes into the debug output. Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_cmd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index d0f7731802c9..deabc14b4ea4 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -778,11 +778,12 @@ int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status) ((dev->fw_ver & 0xffff0000ull) >> 16) | ((dev->fw_ver & 0x0000ffffull) << 16); + MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); + dev->cmd.max_cmds = 1 << lg; + mthca_dbg(dev, "FW version %012llx, max commands %d\n", (unsigned long long) dev->fw_ver, dev->cmd.max_cmds); - MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); - dev->cmd.max_cmds = 1 << lg; MTHCA_GET(dev->catas_err.addr, outbox, QUERY_FW_ERR_START_OFFSET); MTHCA_GET(dev->catas_err.size, outbox, QUERY_FW_ERR_SIZE_OFFSET); From 4b79f0af48d529a360d3529def01835dc5d45fe1 Mon Sep 17 00:00:00 2001 From: Ian McDonald Date: Sun, 23 Jul 2006 23:33:28 -0700 Subject: [PATCH 11/67] [DCCP]: Fix default sequence window size When using the default sequence window size (100) I got the following in my logs: Jun 22 14:24:09 localhost kernel: [ 1492.114775] DCCP: Step 6 failed for DATA packet, (LSWL(6279674225) <= P.seqno(6279674749) <= S.SWH(6279674324)) and (P.ackno doesn't exist or LAWL(18798206530) <= P.ackno(1125899906842620) <= S.AWH(18798206548), sending SYNC... Jun 22 14:24:09 localhost kernel: [ 1492.115147] DCCP: Step 6 failed for DATA packet, (LSWL(6279674225) <= P.seqno(6279674750) <= S.SWH(6279674324)) and (P.ackno doesn't exist or LAWL(18798206530) <= P.ackno(1125899906842620) <= S.AWH(18798206549), sending SYNC... I went to alter the default sysctl and it didn't take for new sockets. Below patch fixes this. I think the default is too low but it is what the DCCP spec specifies. As a side effect of this my rx speed using iperf goes from about 2.8 Mbits/sec to 3.5. This is still far too slow but it is a step in the right direction. Compile tested only for IPv6 but not particularly complex change. Signed off by: Ian McDonald Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/dccp/feat.h | 2 ++ net/dccp/ipv4.c | 3 +-- net/dccp/ipv6.c | 4 ++-- net/dccp/options.c | 2 ++ 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 6048373c7186..b44c45504fb6 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -26,4 +26,6 @@ extern void dccp_feat_clean(struct dccp_minisock *dmsk); extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); extern int dccp_feat_init(struct dccp_minisock *dmsk); +extern int dccp_feat_default_sequence_window; + #endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index c3073e7e81d3..7f56f7e8f571 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -504,8 +504,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq->loc_addr = daddr; ireq->rmt_addr = saddr; - req->rcv_wnd = 100; /* Fake, option parsing will get the - right value */ + req->rcv_wnd = dccp_feat_default_sequence_window; ireq->opt = NULL; /* diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ff42bc43263d..9f3d4d7cd0bf 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -31,6 +31,7 @@ #include "dccp.h" #include "ipv6.h" +#include "feat.h" /* Socket used for sending RSTs and ACKs */ static struct socket *dccp_v6_ctl_socket; @@ -707,8 +708,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr); ipv6_addr_copy(&ireq6->loc_addr, &skb->nh.ipv6h->daddr); - req->rcv_wnd = 100; /* Fake, option parsing will get the - right value */ + req->rcv_wnd = dccp_feat_default_sequence_window; ireq6->pktopts = NULL; if (ipv6_opt_accepted(sk, skb) || diff --git a/net/dccp/options.c b/net/dccp/options.c index c3cda1e39aa8..daf72bb671f0 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -29,6 +29,8 @@ int dccp_feat_default_ack_ratio = DCCPF_INITIAL_ACK_RATIO; int dccp_feat_default_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; int dccp_feat_default_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; +EXPORT_SYMBOL_GPL(dccp_feat_default_sequence_window); + void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = dccp_feat_default_sequence_window; From 2266d8886f64c66e0a4e61e3e1c19dbc27ed00d4 Mon Sep 17 00:00:00 2001 From: Guillaume Chazarain Date: Sun, 23 Jul 2006 23:37:24 -0700 Subject: [PATCH 12/67] [PKT_SCHED]: Fix regression in PSCHED_TADD{,2}. In PSCHED_TADD and PSCHED_TADD2, if delta is less than tv.tv_usec (so, less than USEC_PER_SEC too) then tv_res will be smaller than tv. The affectation "(tv_res).tv_usec = __delta;" is wrong. The fix is to revert to the original code before 4ee303dfeac6451b402e3d8512723d3a0f861857 and change the 'if' in 'while'. [Shuya MAEDA: "while (__delta >= USEC_PER_SEC){ ... }" instead of "while (__delta > USEC_PER_SEC){ ... }"] Signed-off-by: Guillaume Chazarain Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 1925c65e617b..f6afee73235d 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -169,23 +169,17 @@ psched_tod_diff(int delta_sec, int bound) #define PSCHED_TADD2(tv, delta, tv_res) \ ({ \ - int __delta = (delta); \ - (tv_res) = (tv); \ - while(__delta >= USEC_PER_SEC){ \ - (tv_res).tv_sec++; \ - __delta -= USEC_PER_SEC; \ - } \ + int __delta = (tv).tv_usec + (delta); \ + (tv_res).tv_sec = (tv).tv_sec; \ + while (__delta >= USEC_PER_SEC) { (tv_res).tv_sec++; __delta -= USEC_PER_SEC; } \ (tv_res).tv_usec = __delta; \ }) #define PSCHED_TADD(tv, delta) \ ({ \ - int __delta = (delta); \ - while(__delta >= USEC_PER_SEC){ \ - (tv).tv_sec++; \ - __delta -= USEC_PER_SEC; \ - } \ - (tv).tv_usec = __delta; \ + (tv).tv_usec += (delta); \ + while ((tv).tv_usec >= USEC_PER_SEC) { (tv).tv_sec++; \ + (tv).tv_usec -= USEC_PER_SEC; } \ }) /* Set/check that time is in the "past perfect"; From 98bcd08b5bfe78c1c9bda5768aa081e0fe4fcc4f Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 14 Jul 2006 11:42:12 +0200 Subject: [PATCH 13/67] [Bluetooth] Correct RFCOMM channel MTU for broken implementations Some Bluetooth RFCOMM implementations try to negotiate a bigger channel MTU than we can support for a particular session. The maximum MTU for a RFCOMM session is limited through the L2CAP layer. So if the other side proposes a channel MTU that is bigger than the underlying L2CAP MTU, we should reduce it to the L2CAP MTU of the session minus five bytes for the RFCOMM headers. Signed-off-by: Marcel Holtmann --- net/bluetooth/rfcomm/core.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 77eab8f4c7fd..332dd8f436ea 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -55,6 +55,7 @@ #define VERSION "1.8" static int disable_cfc = 0; +static int channel_mtu = -1; static unsigned int l2cap_mtu = RFCOMM_MAX_L2CAP_MTU; static struct task_struct *rfcomm_thread; @@ -812,7 +813,10 @@ static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d pn->credits = 0; } - pn->mtu = htobs(d->mtu); + if (cr && channel_mtu >= 0) + pn->mtu = htobs(channel_mtu); + else + pn->mtu = htobs(d->mtu); *ptr = __fcs(buf); ptr++; @@ -1243,7 +1247,10 @@ static int rfcomm_apply_pn(struct rfcomm_dlc *d, int cr, struct rfcomm_pn *pn) d->priority = pn->priority; - d->mtu = s->mtu = btohs(pn->mtu); + d->mtu = btohs(pn->mtu); + + if (cr && d->mtu > s->mtu) + d->mtu = s->mtu; return 0; } @@ -1770,6 +1777,11 @@ static inline void rfcomm_accept_connection(struct rfcomm_session *s) s = rfcomm_session_add(nsock, BT_OPEN); if (s) { rfcomm_session_hold(s); + + /* We should adjust MTU on incoming sessions. + * L2CAP MTU minus UIH header and FCS. */ + s->mtu = min(l2cap_pi(nsock->sk)->omtu, l2cap_pi(nsock->sk)->imtu) - 5; + rfcomm_schedule(RFCOMM_SCHED_RX); } else sock_release(nsock); @@ -2087,6 +2099,9 @@ module_exit(rfcomm_exit); module_param(disable_cfc, bool, 0644); MODULE_PARM_DESC(disable_cfc, "Disable credit based flow control"); +module_param(channel_mtu, int, 0644); +MODULE_PARM_DESC(channel_mtu, "Default MTU for the RFCOMM channel"); + module_param(l2cap_mtu, uint, 0644); MODULE_PARM_DESC(l2cap_mtu, "Default MTU for the L2CAP connection"); From 520ca78acc652c89c92e8bf29536319afa9d88bb Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 14 Jul 2006 16:01:52 +0200 Subject: [PATCH 14/67] [Bluetooth] Correct SCO buffer size for another Broadcom chip The SCO buffer size values on IBM/Lenovo ThinkPad laptops with a Bluetooth chip from Broadcom are wrong. The USB Bluetooth driver has to set a quirk to correct the SCO buffer size values. Signed-off-by: Marcel Holtmann --- drivers/bluetooth/hci_usb.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c index 6a0c2230f82f..f4784f49f00a 100644 --- a/drivers/bluetooth/hci_usb.c +++ b/drivers/bluetooth/hci_usb.c @@ -67,6 +67,8 @@ static int ignore = 0; static int ignore_dga = 0; static int ignore_csr = 0; static int ignore_sniffer = 0; +static int disable_scofix = 0; +static int force_scofix = 0; static int reset = 0; #ifdef CONFIG_BT_HCIUSB_SCO @@ -110,6 +112,9 @@ static struct usb_device_id blacklist_ids[] = { { USB_DEVICE(0x0a5c, 0x200a), .driver_info = HCI_RESET | HCI_BROKEN_ISOC }, { USB_DEVICE(0x0a5c, 0x2009), .driver_info = HCI_BCM92035 }, + /* IBM/Lenovo ThinkPad with Broadcom chip */ + { USB_DEVICE(0x0a5c, 0x201e), .driver_info = HCI_WRONG_SCO_MTU }, + /* Microsoft Wireless Transceiver for Bluetooth 2.0 */ { USB_DEVICE(0x045e, 0x009c), .driver_info = HCI_RESET }, @@ -990,8 +995,10 @@ static int hci_usb_probe(struct usb_interface *intf, const struct usb_device_id if (reset || id->driver_info & HCI_RESET) set_bit(HCI_QUIRK_RESET_ON_INIT, &hdev->quirks); - if (id->driver_info & HCI_WRONG_SCO_MTU) - set_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks); + if (force_scofix || id->driver_info & HCI_WRONG_SCO_MTU) { + if (!disable_scofix) + set_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks); + } if (id->driver_info & HCI_SNIFFER) { if (le16_to_cpu(udev->descriptor.bcdDevice) > 0x997) @@ -1161,6 +1168,12 @@ MODULE_PARM_DESC(ignore_csr, "Ignore devices with id 0a12:0001"); module_param(ignore_sniffer, bool, 0644); MODULE_PARM_DESC(ignore_sniffer, "Ignore devices with id 0a12:0002"); +module_param(disable_scofix, bool, 0644); +MODULE_PARM_DESC(disable_scofix, "Disable fixup of wrong SCO buffer size"); + +module_param(force_scofix, bool, 0644); +MODULE_PARM_DESC(force_scofix, "Force fixup of wrong SCO buffers size"); + module_param(reset, bool, 0644); MODULE_PARM_DESC(reset, "Send HCI reset command on initialization"); From ea9727f6e55dabc7a58cf56c87e65665e239e171 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 18 Jul 2006 17:47:40 +0200 Subject: [PATCH 15/67] [Bluetooth] Correct SCO buffer size for Belkin devices The Belkin F8T012 and F8T013 devices are both based on a Bluetooth chip from Broadcom and their SCO buffer size values are wrong. The Bluetooth core should correct these values. Signed-off-by: Marcel Holtmann --- drivers/bluetooth/hci_usb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c index f4784f49f00a..d73eb10cd3e1 100644 --- a/drivers/bluetooth/hci_usb.c +++ b/drivers/bluetooth/hci_usb.c @@ -127,8 +127,9 @@ static struct usb_device_id blacklist_ids[] = { /* RTX Telecom based adapter with buggy SCO support */ { USB_DEVICE(0x0400, 0x0807), .driver_info = HCI_BROKEN_ISOC }, - /* Belkin F8T012 */ + /* Belkin F8T012 and F8T013 devices */ { USB_DEVICE(0x050d, 0x0012), .driver_info = HCI_WRONG_SCO_MTU }, + { USB_DEVICE(0x050d, 0x0013), .driver_info = HCI_WRONG_SCO_MTU }, /* Digianswer devices */ { USB_DEVICE(0x08fd, 0x0001), .driver_info = HCI_DIGIANSWER }, From 8e4f7230a3bd015862f3af58dc563dbc1cdebfe2 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 18 Jul 2006 18:04:59 +0200 Subject: [PATCH 16/67] [Bluetooth] Add quirk for another broken RTX Telecom based dongle This patch disables the ISOC transfers for another broken RTX Telecom based USB dongle. Starting the USB ISOC transfers only ends in a burst of error messages for invalid SCO packets on connection handle 0. Signed-off-by: Marcel Holtmann --- drivers/bluetooth/hci_usb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c index d73eb10cd3e1..963c01459cc8 100644 --- a/drivers/bluetooth/hci_usb.c +++ b/drivers/bluetooth/hci_usb.c @@ -124,8 +124,9 @@ static struct usb_device_id blacklist_ids[] = { /* ISSC Bluetooth Adapter v3.1 */ { USB_DEVICE(0x1131, 0x1001), .driver_info = HCI_RESET }, - /* RTX Telecom based adapter with buggy SCO support */ + /* RTX Telecom based adapters with buggy SCO support */ { USB_DEVICE(0x0400, 0x0807), .driver_info = HCI_BROKEN_ISOC }, + { USB_DEVICE(0x0400, 0x080a), .driver_info = HCI_BROKEN_ISOC }, /* Belkin F8T012 and F8T013 devices */ { USB_DEVICE(0x050d, 0x0012), .driver_info = HCI_WRONG_SCO_MTU }, From e9e9290f5c85887baf1123a36ec9fdf56a10cf4b Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 18 Jul 2006 18:32:33 +0200 Subject: [PATCH 17/67] [Bluetooth] Enable SCO support for Broadcom HID proxy dongle The Broadcom dongles with HID proxy support actually support SCO over HCI if the SCO buffer size values are corrected. So instead of disabling the SCO support, mark this dongle with the quirk for the Bluetooth core to correct the wrong buffer size values. Signed-off-by: Marcel Holtmann --- drivers/bluetooth/hci_usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c index 963c01459cc8..e2d4beac7420 100644 --- a/drivers/bluetooth/hci_usb.c +++ b/drivers/bluetooth/hci_usb.c @@ -109,7 +109,7 @@ static struct usb_device_id blacklist_ids[] = { { USB_DEVICE(0x0a5c, 0x2033), .driver_info = HCI_IGNORE }, /* Broadcom BCM2035 */ - { USB_DEVICE(0x0a5c, 0x200a), .driver_info = HCI_RESET | HCI_BROKEN_ISOC }, + { USB_DEVICE(0x0a5c, 0x200a), .driver_info = HCI_RESET | HCI_WRONG_SCO_MTU }, { USB_DEVICE(0x0a5c, 0x2009), .driver_info = HCI_BCM92035 }, /* IBM/Lenovo ThinkPad with Broadcom chip */ From a922ba5510530ae8e3c60edc85c56f72347a3c86 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 24 Jul 2006 13:49:06 -0700 Subject: [PATCH 18/67] [IPV6] xfrm6_tunnel: Delete debugging code. It doesn't compile, and it's dubious in several regards: 1) is enabled by non-Kconfig controlled CONFIG_* value (noted by Randy Dunlap) 2) XFRM6_TUNNEL_SPI_MAGIC is defined after it's first use 3) the debugging messages print object pointer addresses which have no meaning without context So let's just get rid of it. Signed-off-by: David S. Miller --- net/ipv6/xfrm6_tunnel.c | 140 +++------------------------------------- 1 file changed, 10 insertions(+), 130 deletions(-) diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 6b44fe8516c3..c8f9369c2a87 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -31,27 +31,6 @@ #include #include -#ifdef CONFIG_IPV6_XFRM6_TUNNEL_DEBUG -# define X6TDEBUG 3 -#else -# define X6TDEBUG 1 -#endif - -#define X6TPRINTK(fmt, args...) printk(fmt, ## args) -#define X6TNOPRINTK(fmt, args...) do { ; } while(0) - -#if X6TDEBUG >= 1 -# define X6TPRINTK1 X6TPRINTK -#else -# define X6TPRINTK1 X6TNOPRINTK -#endif - -#if X6TDEBUG >= 3 -# define X6TPRINTK3 X6TPRINTK -#else -# define X6TPRINTK3 X6TNOPRINTK -#endif - /* * xfrm_tunnel_spi things are for allocating unique id ("spi") * per xfrm_address_t. @@ -62,15 +41,8 @@ struct xfrm6_tunnel_spi { xfrm_address_t addr; u32 spi; atomic_t refcnt; -#ifdef XFRM6_TUNNEL_SPI_MAGIC - u32 magic; -#endif }; -#ifdef CONFIG_IPV6_XFRM6_TUNNEL_DEBUG -# define XFRM6_TUNNEL_SPI_MAGIC 0xdeadbeef -#endif - static DEFINE_RWLOCK(xfrm6_tunnel_spi_lock); static u32 xfrm6_tunnel_spi; @@ -86,43 +58,15 @@ static kmem_cache_t *xfrm6_tunnel_spi_kmem __read_mostly; static struct hlist_head xfrm6_tunnel_spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE]; static struct hlist_head xfrm6_tunnel_spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE]; -#ifdef XFRM6_TUNNEL_SPI_MAGIC -static int x6spi_check_magic(const struct xfrm6_tunnel_spi *x6spi, - const char *name) -{ - if (unlikely(x6spi->magic != XFRM6_TUNNEL_SPI_MAGIC)) { - X6TPRINTK3(KERN_DEBUG "%s(): x6spi object " - "at %p has corrupted magic %08x " - "(should be %08x)\n", - name, x6spi, x6spi->magic, XFRM6_TUNNEL_SPI_MAGIC); - return -1; - } - return 0; -} -#else -static int inline x6spi_check_magic(const struct xfrm6_tunnel_spi *x6spi, - const char *name) -{ - return 0; -} -#endif - -#define X6SPI_CHECK_MAGIC(x6spi) x6spi_check_magic((x6spi), __FUNCTION__) - - static unsigned inline xfrm6_tunnel_spi_hash_byaddr(xfrm_address_t *addr) { unsigned h; - X6TPRINTK3(KERN_DEBUG "%s(addr=%p)\n", __FUNCTION__, addr); - h = addr->a6[0] ^ addr->a6[1] ^ addr->a6[2] ^ addr->a6[3]; h ^= h >> 16; h ^= h >> 8; h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1; - X6TPRINTK3(KERN_DEBUG "%s() = %u\n", __FUNCTION__, h); - return h; } @@ -136,19 +80,13 @@ static int xfrm6_tunnel_spi_init(void) { int i; - X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); - xfrm6_tunnel_spi = 0; xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi", sizeof(struct xfrm6_tunnel_spi), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!xfrm6_tunnel_spi_kmem) { - X6TPRINTK1(KERN_ERR - "%s(): failed to allocate xfrm6_tunnel_spi_kmem\n", - __FUNCTION__); + if (!xfrm6_tunnel_spi_kmem) return -ENOMEM; - } for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) INIT_HLIST_HEAD(&xfrm6_tunnel_spi_byaddr[i]); @@ -161,22 +99,16 @@ static void xfrm6_tunnel_spi_fini(void) { int i; - X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); - for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) { if (!hlist_empty(&xfrm6_tunnel_spi_byaddr[i])) - goto err; + return; } for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) { if (!hlist_empty(&xfrm6_tunnel_spi_byspi[i])) - goto err; + return; } kmem_cache_destroy(xfrm6_tunnel_spi_kmem); xfrm6_tunnel_spi_kmem = NULL; - return; -err: - X6TPRINTK1(KERN_ERR "%s(): table is not empty\n", __FUNCTION__); - return; } static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) @@ -184,19 +116,13 @@ static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) struct xfrm6_tunnel_spi *x6spi; struct hlist_node *pos; - X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); - hlist_for_each_entry(x6spi, pos, &xfrm6_tunnel_spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr) { - if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) { - X6SPI_CHECK_MAGIC(x6spi); - X6TPRINTK3(KERN_DEBUG "%s() = %p(%u)\n", __FUNCTION__, x6spi, x6spi->spi); + if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) return x6spi; - } } - X6TPRINTK3(KERN_DEBUG "%s() = NULL(0)\n", __FUNCTION__); return NULL; } @@ -205,8 +131,6 @@ u32 xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) struct xfrm6_tunnel_spi *x6spi; u32 spi; - X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); - read_lock_bh(&xfrm6_tunnel_spi_lock); x6spi = __xfrm6_tunnel_spi_lookup(saddr); spi = x6spi ? x6spi->spi : 0; @@ -223,8 +147,6 @@ static u32 __xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr) struct hlist_node *pos; unsigned index; - X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); - if (xfrm6_tunnel_spi < XFRM6_TUNNEL_SPI_MIN || xfrm6_tunnel_spi >= XFRM6_TUNNEL_SPI_MAX) xfrm6_tunnel_spi = XFRM6_TUNNEL_SPI_MIN; @@ -258,18 +180,10 @@ try_next_2:; spi = 0; goto out; alloc_spi: - X6TPRINTK3(KERN_DEBUG "%s(): allocate new spi for " NIP6_FMT "\n", - __FUNCTION__, - NIP6(*(struct in6_addr *)saddr)); x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, SLAB_ATOMIC); - if (!x6spi) { - X6TPRINTK1(KERN_ERR "%s(): kmem_cache_alloc() failed\n", - __FUNCTION__); + if (!x6spi) goto out; - } -#ifdef XFRM6_TUNNEL_SPI_MAGIC - x6spi->magic = XFRM6_TUNNEL_SPI_MAGIC; -#endif + memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr)); x6spi->spi = spi; atomic_set(&x6spi->refcnt, 1); @@ -278,9 +192,7 @@ try_next_2:; index = xfrm6_tunnel_spi_hash_byaddr(saddr); hlist_add_head(&x6spi->list_byaddr, &xfrm6_tunnel_spi_byaddr[index]); - X6SPI_CHECK_MAGIC(x6spi); out: - X6TPRINTK3(KERN_DEBUG "%s() = %u\n", __FUNCTION__, spi); return spi; } @@ -289,8 +201,6 @@ u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr) struct xfrm6_tunnel_spi *x6spi; u32 spi; - X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); - write_lock_bh(&xfrm6_tunnel_spi_lock); x6spi = __xfrm6_tunnel_spi_lookup(saddr); if (x6spi) { @@ -300,8 +210,6 @@ u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr) spi = __xfrm6_tunnel_alloc_spi(saddr); write_unlock_bh(&xfrm6_tunnel_spi_lock); - X6TPRINTK3(KERN_DEBUG "%s() = %u\n", __FUNCTION__, spi); - return spi; } @@ -312,8 +220,6 @@ void xfrm6_tunnel_free_spi(xfrm_address_t *saddr) struct xfrm6_tunnel_spi *x6spi; struct hlist_node *pos, *n; - X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); - write_lock_bh(&xfrm6_tunnel_spi_lock); hlist_for_each_entry_safe(x6spi, pos, n, @@ -321,12 +227,6 @@ void xfrm6_tunnel_free_spi(xfrm_address_t *saddr) list_byaddr) { if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) { - X6TPRINTK3(KERN_DEBUG "%s(): x6spi object for " NIP6_FMT - " found at %p\n", - __FUNCTION__, - NIP6(*(struct in6_addr *)saddr), - x6spi); - X6SPI_CHECK_MAGIC(x6spi); if (atomic_dec_and_test(&x6spi->refcnt)) { hlist_del(&x6spi->list_byaddr); hlist_del(&x6spi->list_byspi); @@ -377,20 +277,14 @@ static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, case ICMPV6_ADDR_UNREACH: case ICMPV6_PORT_UNREACH: default: - X6TPRINTK3(KERN_DEBUG - "xfrm6_tunnel: Destination Unreach.\n"); break; } break; case ICMPV6_PKT_TOOBIG: - X6TPRINTK3(KERN_DEBUG - "xfrm6_tunnel: Packet Too Big.\n"); break; case ICMPV6_TIME_EXCEED: switch (code) { case ICMPV6_EXC_HOPLIMIT: - X6TPRINTK3(KERN_DEBUG - "xfrm6_tunnel: Too small Hoplimit.\n"); break; case ICMPV6_EXC_FRAGTIME: default: @@ -447,22 +341,14 @@ static struct xfrm6_tunnel xfrm6_tunnel_handler = { static int __init xfrm6_tunnel_init(void) { - X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); - - if (xfrm_register_type(&xfrm6_tunnel_type, AF_INET6) < 0) { - X6TPRINTK1(KERN_ERR - "xfrm6_tunnel init: can't add xfrm type\n"); + if (xfrm_register_type(&xfrm6_tunnel_type, AF_INET6) < 0) return -EAGAIN; - } + if (xfrm6_tunnel_register(&xfrm6_tunnel_handler)) { - X6TPRINTK1(KERN_ERR - "xfrm6_tunnel init(): can't add handler\n"); xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); return -EAGAIN; } if (xfrm6_tunnel_spi_init() < 0) { - X6TPRINTK1(KERN_ERR - "xfrm6_tunnel init: failed to initialize spi\n"); xfrm6_tunnel_deregister(&xfrm6_tunnel_handler); xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); return -EAGAIN; @@ -472,15 +358,9 @@ static int __init xfrm6_tunnel_init(void) static void __exit xfrm6_tunnel_fini(void) { - X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); - xfrm6_tunnel_spi_fini(); - if (xfrm6_tunnel_deregister(&xfrm6_tunnel_handler)) - X6TPRINTK1(KERN_ERR - "xfrm6_tunnel close: can't remove handler\n"); - if (xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6) < 0) - X6TPRINTK1(KERN_ERR - "xfrm6_tunnel close: can't remove xfrm type\n"); + xfrm6_tunnel_deregister(&xfrm6_tunnel_handler); + xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); } module_init(xfrm6_tunnel_init); From 6c753c3d3be0f8d1b570ec5720ad1bb4caf8232b Mon Sep 17 00:00:00 2001 From: Stefan Rompf Date: Mon, 24 Jul 2006 13:52:13 -0700 Subject: [PATCH 19/67] [VLAN]: Fix link state propagation When the queue of the underlying device is stopped at initialization time or the device is marked "not present", the state will be propagated to the vlan device and never change. Based on an analysis by Patrick McHardy. Signed-off-by: Stefan Rompf ACKed-by: Patrick McHardy Signed-off-by: David S. Miller --- net/8021q/vlan.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 0ade0c63fdf6..18fcb9fa518d 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -67,10 +67,6 @@ static struct packet_type vlan_packet_type = { .func = vlan_skb_recv, /* VLAN receive method */ }; -/* Bits of netdev state that are propagated from real device to virtual */ -#define VLAN_LINK_STATE_MASK \ - ((1<<__LINK_STATE_PRESENT)|(1<<__LINK_STATE_NOCARRIER)|(1<<__LINK_STATE_DORMANT)) - /* End of global variables definitions. */ /* @@ -479,7 +475,9 @@ static struct net_device *register_vlan_device(const char *eth_IF_name, new_dev->flags = real_dev->flags; new_dev->flags &= ~IFF_UP; - new_dev->state = real_dev->state & ~(1<<__LINK_STATE_START); + new_dev->state = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) | + (1<<__LINK_STATE_DORMANT))) | + (1<<__LINK_STATE_PRESENT); /* need 4 bytes for extra VLAN header info, * hope the underlying device can handle it. From 37182d1bd3264cf9c0dce3408bee48af0755de7e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2006 15:30:28 -0700 Subject: [PATCH 20/67] [NET]: Remove CONFIG_HAVE_ARCH_DEV_ALLOC_SKB skbuff.h has an #ifndef CONFIG_HAVE_ARCH_DEV_ALLOC_SKB to allow architectures to reimplement __dev_alloc_skb. It's not set on any architecture and now that we have an architecture-overrideable NET_SKB_PAD there is not point at all to have one either. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0bf31b83578c..f98757976647 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1066,7 +1066,6 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } -#ifndef CONFIG_HAVE_ARCH_DEV_ALLOC_SKB /** * __dev_alloc_skb - allocate an skbuff for sending * @length: length to allocate @@ -1087,9 +1086,6 @@ static inline struct sk_buff *__dev_alloc_skb(unsigned int length, skb_reserve(skb, NET_SKB_PAD); return skb; } -#else -extern struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask); -#endif /** * dev_alloc_skb - allocate an skbuff for sending From b4e54de8d34afe7fcf08bfe91070d9dfeae6ed27 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2006 15:31:14 -0700 Subject: [PATCH 21/67] [NET]: Correct dev_alloc_skb kerneldoc dev_alloc_skb is designated for RX descriptors, not TX. (Some drivers use it for the latter anyway, but that's a different story) Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f98757976647..4307e764ef0a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1067,7 +1067,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) } /** - * __dev_alloc_skb - allocate an skbuff for sending + * __dev_alloc_skb - allocate an skbuff for receiving * @length: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb * @@ -1088,7 +1088,7 @@ static inline struct sk_buff *__dev_alloc_skb(unsigned int length, } /** - * dev_alloc_skb - allocate an skbuff for sending + * dev_alloc_skb - allocate an skbuff for receiving * @length: length to allocate * * Allocate a new &sk_buff and assign it a usage count of one. The From eb398d1044e0c1c19c2f5041acdb29ddb5bbc9f8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 22 Jul 2006 01:12:09 -0700 Subject: [PATCH 22/67] [SPARC64]: Explicitly print return PC when the kernel fault PC is bogus. That way we'll have at least some debugging info even if the stack dump explodes. Signed-off-by: David S. Miller --- arch/sparc64/mm/fault.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c index 1605967cce91..55ae802dc0ad 100644 --- a/arch/sparc64/mm/fault.c +++ b/arch/sparc64/mm/fault.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -132,6 +133,8 @@ static void bad_kernel_pc(struct pt_regs *regs, unsigned long vaddr) printk(KERN_CRIT "OOPS: Bogus kernel PC [%016lx] in fault handler\n", regs->tpc); + printk(KERN_CRIT "OOPS: RPC [%016lx]\n", regs->u_regs[15]); + print_symbol("RPC: <%s>\n", regs->u_regs[15]); printk(KERN_CRIT "OOPS: Fault was to vaddr[%lx]\n", vaddr); __asm__("mov %%sp, %0" : "=r" (ksp)); show_stack(current, ksp); From 29ed46015dd61f99d203ec7ab307ccf92d2d0cf2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 22 Jul 2006 02:05:07 -0700 Subject: [PATCH 23/67] [SPARC]: Fix SA_STATIC_ALLOC value. It alises IRQF_SHARED which causes all kinds of problems. Signed-off-by: David S. Miller --- include/asm-sparc/signal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-sparc/signal.h b/include/asm-sparc/signal.h index 0ae5084c427b..d03a21c97abb 100644 --- a/include/asm-sparc/signal.h +++ b/include/asm-sparc/signal.h @@ -168,7 +168,7 @@ struct sigstack { * statically allocated data.. which is NOT GOOD. * */ -#define SA_STATIC_ALLOC 0x80 +#define SA_STATIC_ALLOC 0x8000 #endif #include From 6bc063d414a815937fc81449fa9ffe8d3a4cdf22 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 24 Jul 2006 22:47:14 -0700 Subject: [PATCH 24/67] [SCSI] esp: Fix build. The data_cmd[] member got deleted, so do not use it any more. Scsi commands do not have their ->cmd[] overwritten temporary to probe for status after an error before retrying. Signed-off-by: David S. Miller --- drivers/scsi/esp.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/esp.c b/drivers/scsi/esp.c index eaf64c7e54e7..98bd22714d0d 100644 --- a/drivers/scsi/esp.c +++ b/drivers/scsi/esp.c @@ -2754,18 +2754,15 @@ static int esp_do_data_finale(struct esp *esp) */ static int esp_should_clear_sync(struct scsi_cmnd *sp) { - u8 cmd1 = sp->cmnd[0]; - u8 cmd2 = sp->data_cmnd[0]; + u8 cmd = sp->cmnd[0]; /* These cases are for spinning up a disk and * waiting for that spinup to complete. */ - if (cmd1 == START_STOP || - cmd2 == START_STOP) + if (cmd == START_STOP) return 0; - if (cmd1 == TEST_UNIT_READY || - cmd2 == TEST_UNIT_READY) + if (cmd == TEST_UNIT_READY) return 0; /* One more special case for SCSI tape drives, @@ -2773,8 +2770,7 @@ static int esp_should_clear_sync(struct scsi_cmnd *sp) * completion of a rewind or tape load operation. */ if (sp->device->type == TYPE_TAPE) { - if (cmd1 == MODE_SENSE || - cmd2 == MODE_SENSE) + if (cmd == MODE_SENSE) return 0; } From 083edca05ab1fa6efac1ba414018f7f45a4a83ff Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 24 Jul 2006 22:52:10 -0700 Subject: [PATCH 25/67] [NETFILTER]: H.323 helper: fix possible NULL-ptr dereference An RCF message containing a timeout results in a NULL-ptr dereference if no RRQ has been seen before. Noticed by the "SATURN tool", reported by Thomas Dillig and Isil Dillig . Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_helper_h323.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c index af35235672d5..9a39e2969712 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c @@ -1200,7 +1200,7 @@ static struct ip_conntrack_expect *find_expect(struct ip_conntrack *ct, tuple.dst.protonum = IPPROTO_TCP; exp = __ip_conntrack_expect_find(&tuple); - if (exp->master == ct) + if (exp && exp->master == ct) return exp; return NULL; } From 3bc38712e3a6e0596ccb6f8299043a826f983701 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 24 Jul 2006 22:52:47 -0700 Subject: [PATCH 26/67] [NETFILTER]: nf_queue: handle NF_STOP and unknown verdicts in nf_reinject In case of an unknown verdict or NF_STOP the packet leaks. Unknown verdicts can happen when userspace is buggy. Reinject the packet in case of NF_STOP, drop on unknown verdicts. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_queue.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index bb6fcee452ca..662a869593bf 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -219,21 +219,20 @@ void nf_reinject(struct sk_buff *skb, struct nf_info *info, switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: + case NF_STOP: info->okfn(skb); + case NF_STOLEN: break; - case NF_QUEUE: if (!nf_queue(&skb, elem, info->pf, info->hook, info->indev, info->outdev, info->okfn, verdict >> NF_VERDICT_BITS)) goto next_hook; break; + default: + kfree_skb(skb); } rcu_read_unlock(); - - if (verdict == NF_DROP) - kfree_skb(skb); - kfree(info); return; } From 72b558235950538da8bf5a8de746a194831c6fe6 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 24 Jul 2006 22:53:12 -0700 Subject: [PATCH 27/67] [NETFILTER]: conntrack: fix SYSCTL=n compile Signed-off-by: Adrian Bunk Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_standalone.c | 4 ++-- net/netfilter/nf_conntrack_standalone.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 7bd3c22003a2..7a9fa04a467a 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -534,6 +534,8 @@ static struct nf_hook_ops ip_conntrack_ops[] = { /* Sysctl support */ +int ip_conntrack_checksum = 1; + #ifdef CONFIG_SYSCTL /* From ip_conntrack_core.c */ @@ -568,8 +570,6 @@ extern unsigned int ip_ct_generic_timeout; static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; -int ip_conntrack_checksum = 1; - static struct ctl_table_header *ip_ct_sysctl_header; static ctl_table ip_ct_sysctl_table[] = { diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 5fcab2ef231f..4ef836699962 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -428,6 +428,8 @@ static struct file_operations ct_cpu_seq_fops = { /* Sysctl support */ +int nf_conntrack_checksum = 1; + #ifdef CONFIG_SYSCTL /* From nf_conntrack_core.c */ @@ -459,8 +461,6 @@ extern unsigned int nf_ct_generic_timeout; static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; -int nf_conntrack_checksum = 1; - static struct ctl_table_header *nf_ct_sysctl_header; static ctl_table nf_ct_sysctl_table[] = { From 8cf8fb5687bb37737ea419a0b2143aab49295779 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 24 Jul 2006 22:53:35 -0700 Subject: [PATCH 28/67] [NETFILTER]: SNMP NAT: fix byteorder confusion Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_nat_snmp_basic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 0b1b416759cc..18b7fbdccb61 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -1255,9 +1255,9 @@ static int help(struct sk_buff **pskb, struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); /* SNMP replies and originating SNMP traps get mangled */ - if (udph->source == ntohs(SNMP_PORT) && dir != IP_CT_DIR_REPLY) + if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY) return NF_ACCEPT; - if (udph->dest == ntohs(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) + if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) return NF_ACCEPT; /* No NAT? */ From 28658c8967da9083be83af0a37be3b190bae79da Mon Sep 17 00:00:00 2001 From: Phil Oester Date: Mon, 24 Jul 2006 22:54:14 -0700 Subject: [PATCH 29/67] [NETFILTER]: xt_pkttype: fix mismatches on locally generated packets Locally generated broadcast and multicast packets have pkttype set to PACKET_LOOPBACK instead of PACKET_BROADCAST or PACKET_MULTICAST. This causes the pkttype match to fail to match packets of either type. The below patch remedies this by using the daddr as a hint as to broadcast|multicast. While not pretty, this seems like the only way to solve the problem short of just noting this as a limitation of the match. This resolves netfilter bugzilla #484 Signed-off-by: Phil Oester Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/xt_pkttype.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c index 3ac703b5cb8f..d2f5320a80bf 100644 --- a/net/netfilter/xt_pkttype.c +++ b/net/netfilter/xt_pkttype.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include @@ -28,9 +30,17 @@ static int match(const struct sk_buff *skb, unsigned int protoff, int *hotdrop) { + u_int8_t type; const struct xt_pkttype_info *info = matchinfo; - return (skb->pkt_type == info->pkttype) ^ info->invert; + if (skb->pkt_type == PACKET_LOOPBACK) + type = (MULTICAST(skb->nh.iph->daddr) + ? PACKET_MULTICAST + : PACKET_BROADCAST); + else + type = skb->pkt_type; + + return (type == info->pkttype) ^ info->invert; } static struct xt_match pkttype_match = { From 10ea6ac895418bd0d23900e3330daa6ba0836d26 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 24 Jul 2006 22:54:55 -0700 Subject: [PATCH 30/67] [NETFILTER]: bridge netfilter: add deferred output hooks to feature-removal-schedule Add bridge netfilter deferred output hooks to feature-removal-schedule and disable them by default. Until their removal they will be activated by the physdev match when needed. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- Documentation/feature-removal-schedule.txt | 16 ++++++++++++++++ include/linux/netfilter_bridge.h | 2 ++ net/bridge/br_netfilter.c | 5 +++++ net/netfilter/xt_physdev.c | 15 +++++++++++++++ 4 files changed, 38 insertions(+) diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 9d3a0775a11d..87851efb0228 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -258,3 +258,19 @@ Why: These drivers never compiled since they were added to the kernel Who: Jean Delvare --------------------------- + +What: Bridge netfilter deferred IPv4/IPv6 output hook calling +When: January 2007 +Why: The deferred output hooks are a layering violation causing unusual + and broken behaviour on bridge devices. Examples of things they + break include QoS classifation using the MARK or CLASSIFY targets, + the IPsec policy match and connection tracking with VLANs on a + bridge. Their only use is to enable bridge output port filtering + within iptables with the physdev match, which can also be done by + combining iptables and ebtables using netfilter marks. Until it + will get removed the hook deferral is disabled by default and is + only enabled when needed. + +Who: Patrick McHardy + +--------------------------- diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 87764022cc67..31f02ba036ce 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -79,6 +79,8 @@ struct bridge_skb_cb { __u32 ipv4; } daddr; }; + +extern int brnf_deferred_hooks; #endif /* CONFIG_BRIDGE_NETFILTER */ #endif /* __KERNEL__ */ diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index cbc8a389a0a8..05b3de888243 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -61,6 +61,9 @@ static int brnf_filter_vlan_tagged = 1; #define brnf_filter_vlan_tagged 1 #endif +int brnf_deferred_hooks; +EXPORT_SYMBOL_GPL(brnf_deferred_hooks); + static __be16 inline vlan_proto(const struct sk_buff *skb) { return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; @@ -890,6 +893,8 @@ static unsigned int ip_sabotage_out(unsigned int hook, struct sk_buff **pskb, return NF_ACCEPT; else if (ip->version == 6 && !brnf_call_ip6tables) return NF_ACCEPT; + else if (!brnf_deferred_hooks) + return NF_ACCEPT; #endif if (hook == NF_IP_POST_ROUTING) return NF_ACCEPT; diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 5fe4c9df17f5..a9f4f6f3c628 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -113,6 +113,21 @@ checkentry(const char *tablename, if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || info->bitmask & ~XT_PHYSDEV_OP_MASK) return 0; + if (brnf_deferred_hooks == 0 && + info->bitmask & XT_PHYSDEV_OP_OUT && + (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || + info->invert & XT_PHYSDEV_OP_BRIDGED) && + hook_mask & ((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) | + (1 << NF_IP_POST_ROUTING))) { + printk(KERN_WARNING "physdev match: using --physdev-out in the " + "OUTPUT, FORWARD and POSTROUTING chains for non-bridged " + "traffic is deprecated and breaks other things, it will " + "be removed in January 2007. See Documentation/" + "feature-removal-schedule.txt for details. This doesn't " + "affect you in case you're using it for purely bridged " + "traffic.\n"); + brnf_deferred_hooks = 1; + } return 1; } From d5af981e93aff0de5ad2a1a9935a3f6aa5cd3e3c Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 24 Jul 2006 22:55:29 -0700 Subject: [PATCH 31/67] [NETFILTER]: Demote xt_sctp to EXPERIMENTAL After the recent problems with all the SCTP stuff it seems reasonable to mark this as experimental. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 42a178aa30f9..a9894ddfd72a 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -386,8 +386,8 @@ config NETFILTER_XT_MATCH_REALM . If unsure, say `N'. config NETFILTER_XT_MATCH_SCTP - tristate '"sctp" protocol match support' - depends on NETFILTER_XTABLES + tristate '"sctp" protocol match support (EXPERIMENTAL)' + depends on NETFILTER_XTABLES && EXPERIMENTAL help With this option enabled, you will be able to use the `sctp' match in order to match on SCTP source/destination ports From 6b7fdc3ae18a0598a999156b62d55ea55220e00f Mon Sep 17 00:00:00 2001 From: Guillaume Chazarain Date: Mon, 24 Jul 2006 23:44:44 -0700 Subject: [PATCH 32/67] [IPV6]: Clean skb cb on IPv6 input. Clear the accumulated junk in IP6CB when starting to handle an IPV6 packet. Signed-off-by: Guillaume Chazarain Signed-off-by: David S. Miller --- net/ipv6/ip6_input.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index df8f051c0fce..25c2a9e03895 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -71,6 +71,8 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt goto out; } + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + /* * Store incoming device index. When the packet will * be queued, we cannot refer to skb->dev anymore. From d569f1d72f068992d07ab17f7ff9aea7f0d97cdb Mon Sep 17 00:00:00 2001 From: Guillaume Chazarain Date: Mon, 24 Jul 2006 23:45:16 -0700 Subject: [PATCH 33/67] [IPV4]: Clear the whole IPCB, this clears also IPCB(skb)->flags. Signed-off-by: Guillaume Chazarain Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 184c78ca79e6..212734ca238f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -429,7 +429,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, } /* Remove any debris in the socket control block */ - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish); From 7b30f09245d0e6868819b946b2f6879e5d3d106b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Jul 2006 15:02:48 +0200 Subject: [PATCH 34/67] [PATCH] cciss: fix stall with softirq handling and CFQ We need to postpone the queue startup until after the softirq handler has actually finished some requests, otherwise we could be racing with cciss_softirq_done() and not actually restart the queue handling. Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 86 ++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 41 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 1c4df22dfd2a..7b0eca703a67 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1233,6 +1233,50 @@ static inline void complete_buffers(struct bio *bio, int status) } } +static void cciss_check_queues(ctlr_info_t *h) +{ + int start_queue = h->next_to_run; + int i; + + /* check to see if we have maxed out the number of commands that can + * be placed on the queue. If so then exit. We do this check here + * in case the interrupt we serviced was from an ioctl and did not + * free any new commands. + */ + if ((find_first_zero_bit(h->cmd_pool_bits, NR_CMDS)) == NR_CMDS) + return; + + /* We have room on the queue for more commands. Now we need to queue + * them up. We will also keep track of the next queue to run so + * that every queue gets a chance to be started first. + */ + for (i = 0; i < h->highest_lun + 1; i++) { + int curr_queue = (start_queue + i) % (h->highest_lun + 1); + /* make sure the disk has been added and the drive is real + * because this can be called from the middle of init_one. + */ + if (!(h->drv[curr_queue].queue) || !(h->drv[curr_queue].heads)) + continue; + blk_start_queue(h->gendisk[curr_queue]->queue); + + /* check to see if we have maxed out the number of commands + * that can be placed on the queue. + */ + if ((find_first_zero_bit(h->cmd_pool_bits, NR_CMDS)) == NR_CMDS) { + if (curr_queue == start_queue) { + h->next_to_run = + (start_queue + 1) % (h->highest_lun + 1); + break; + } else { + h->next_to_run = curr_queue; + break; + } + } else { + curr_queue = (curr_queue + 1) % (h->highest_lun + 1); + } + } +} + static void cciss_softirq_done(struct request *rq) { CommandList_struct *cmd = rq->completion_data; @@ -1264,6 +1308,7 @@ static void cciss_softirq_done(struct request *rq) spin_lock_irqsave(&h->lock, flags); end_that_request_last(rq, rq->errors); cmd_free(h, cmd, 1); + cciss_check_queues(h); spin_unlock_irqrestore(&h->lock, flags); } @@ -2528,8 +2573,6 @@ static irqreturn_t do_cciss_intr(int irq, void *dev_id, struct pt_regs *regs) CommandList_struct *c; unsigned long flags; __u32 a, a1, a2; - int j; - int start_queue = h->next_to_run; if (interrupt_not_for_us(h)) return IRQ_NONE; @@ -2588,45 +2631,6 @@ static irqreturn_t do_cciss_intr(int irq, void *dev_id, struct pt_regs *regs) } } - /* check to see if we have maxed out the number of commands that can - * be placed on the queue. If so then exit. We do this check here - * in case the interrupt we serviced was from an ioctl and did not - * free any new commands. - */ - if ((find_first_zero_bit(h->cmd_pool_bits, NR_CMDS)) == NR_CMDS) - goto cleanup; - - /* We have room on the queue for more commands. Now we need to queue - * them up. We will also keep track of the next queue to run so - * that every queue gets a chance to be started first. - */ - for (j = 0; j < h->highest_lun + 1; j++) { - int curr_queue = (start_queue + j) % (h->highest_lun + 1); - /* make sure the disk has been added and the drive is real - * because this can be called from the middle of init_one. - */ - if (!(h->drv[curr_queue].queue) || !(h->drv[curr_queue].heads)) - continue; - blk_start_queue(h->gendisk[curr_queue]->queue); - - /* check to see if we have maxed out the number of commands - * that can be placed on the queue. - */ - if ((find_first_zero_bit(h->cmd_pool_bits, NR_CMDS)) == NR_CMDS) { - if (curr_queue == start_queue) { - h->next_to_run = - (start_queue + 1) % (h->highest_lun + 1); - goto cleanup; - } else { - h->next_to_run = curr_queue; - goto cleanup; - } - } else { - curr_queue = (curr_queue + 1) % (h->highest_lun + 1); - } - } - - cleanup: spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags); return IRQ_HANDLED; } From ad01b1ca797e5898cd40bb32cf0dc8c85aa9f7e8 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Tue, 25 Jul 2006 15:04:13 +0200 Subject: [PATCH 35/67] [PATCH] blktrace: fix read-ahead bit It should be toggling the same bit on and off, fix it up. Signed-off-by: Jens Axboe --- block/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blktrace.c b/block/blktrace.c index b8c0702777ff..265f7a830619 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -80,7 +80,7 @@ static u32 bio_act[5] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_AC #define trace_sync_bit(rw) \ (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1)) #define trace_ahead_bit(rw) \ - (((rw) & (1 << BIO_RW_AHEAD)) << (BIO_RW_AHEAD - 0)) + (((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD)) /* * The worker for the various blk_add_trace*() types. Fills out a From 44eb123126d289bac398cac0232309c228386671 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Jul 2006 15:05:21 +0200 Subject: [PATCH 36/67] [PATCH] cfq-iosched: don't use a hard jiffies value, translate from msecs The CIC_SEEKY() test really wants to use the minimum of either: - 2 msecs (not jiffies) - or, the pending slice time So code it like that. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 102ebc2c5c34..aae3123bf3ee 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -936,7 +936,7 @@ static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) * seeks. so allow a little bit of time for him to submit a new rq */ if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) - sl = 2; + sl = min(sl, msecs_to_jiffies(2)); mod_timer(&cfqd->idle_slice_timer, jiffies + sl); return 1; From b9ec6c1b917e2e43a058a78198d54aeca3d71c6f Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 25 Jul 2006 16:37:27 -0700 Subject: [PATCH 37/67] [TG3]: Add tg3_restart_hw() Add tg3_restart_hw() to handle failures when re-initializing the device. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/tg3.c | 80 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index ce6f3be86da0..1253cec6ebdc 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -3590,6 +3590,28 @@ static irqreturn_t tg3_test_isr(int irq, void *dev_id, static int tg3_init_hw(struct tg3 *, int); static int tg3_halt(struct tg3 *, int, int); +/* Restart hardware after configuration changes, self-test, etc. + * Invoked with tp->lock held. + */ +static int tg3_restart_hw(struct tg3 *tp, int reset_phy) +{ + int err; + + err = tg3_init_hw(tp, reset_phy); + if (err) { + printk(KERN_ERR PFX "%s: Failed to re-initialize device, " + "aborting.\n", tp->dev->name); + tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); + tg3_full_unlock(tp); + del_timer_sync(&tp->timer); + tp->irq_sync = 0; + netif_poll_enable(tp->dev); + dev_close(tp->dev); + tg3_full_lock(tp, 0); + } + return err; +} + #ifdef CONFIG_NET_POLL_CONTROLLER static void tg3_poll_controller(struct net_device *dev) { @@ -3630,13 +3652,15 @@ static void tg3_reset_task(void *_data) } tg3_halt(tp, RESET_KIND_SHUTDOWN, 0); - tg3_init_hw(tp, 1); + if (tg3_init_hw(tp, 1)) + goto out; tg3_netif_start(tp); if (restart_timer) mod_timer(&tp->timer, jiffies + 1); +out: tp->tg3_flags &= ~TG3_FLAG_IN_RESET_TASK; tg3_full_unlock(tp); @@ -4124,6 +4148,7 @@ static inline void tg3_set_mtu(struct net_device *dev, struct tg3 *tp, static int tg3_change_mtu(struct net_device *dev, int new_mtu) { struct tg3 *tp = netdev_priv(dev); + int err; if (new_mtu < TG3_MIN_MTU || new_mtu > TG3_MAX_MTU(tp)) return -EINVAL; @@ -4144,13 +4169,14 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu) tg3_set_mtu(dev, tp, new_mtu); - tg3_init_hw(tp, 0); + err = tg3_restart_hw(tp, 0); - tg3_netif_start(tp); + if (!err) + tg3_netif_start(tp); tg3_full_unlock(tp); - return 0; + return err; } /* Free up pending packets in all rx/tx rings. @@ -5815,6 +5841,7 @@ static int tg3_set_mac_addr(struct net_device *dev, void *p) { struct tg3 *tp = netdev_priv(dev); struct sockaddr *addr = p; + int err = 0; if (!is_valid_ether_addr(addr->sa_data)) return -EINVAL; @@ -5832,9 +5859,9 @@ static int tg3_set_mac_addr(struct net_device *dev, void *p) tg3_full_lock(tp, 1); tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); - tg3_init_hw(tp, 0); - - tg3_netif_start(tp); + err = tg3_restart_hw(tp, 0); + if (!err) + tg3_netif_start(tp); tg3_full_unlock(tp); } else { spin_lock_bh(&tp->lock); @@ -5842,7 +5869,7 @@ static int tg3_set_mac_addr(struct net_device *dev, void *p) spin_unlock_bh(&tp->lock); } - return 0; + return err; } /* tp->lock is held. */ @@ -7956,7 +7983,7 @@ static void tg3_get_ringparam(struct net_device *dev, struct ethtool_ringparam * static int tg3_set_ringparam(struct net_device *dev, struct ethtool_ringparam *ering) { struct tg3 *tp = netdev_priv(dev); - int irq_sync = 0; + int irq_sync = 0, err = 0; if ((ering->rx_pending > TG3_RX_RING_SIZE - 1) || (ering->rx_jumbo_pending > TG3_RX_JUMBO_RING_SIZE - 1) || @@ -7980,13 +8007,14 @@ static int tg3_set_ringparam(struct net_device *dev, struct ethtool_ringparam *e if (netif_running(dev)) { tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); - tg3_init_hw(tp, 1); - tg3_netif_start(tp); + err = tg3_restart_hw(tp, 1); + if (!err) + tg3_netif_start(tp); } tg3_full_unlock(tp); - return 0; + return err; } static void tg3_get_pauseparam(struct net_device *dev, struct ethtool_pauseparam *epause) @@ -8001,7 +8029,7 @@ static void tg3_get_pauseparam(struct net_device *dev, struct ethtool_pauseparam static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam *epause) { struct tg3 *tp = netdev_priv(dev); - int irq_sync = 0; + int irq_sync = 0, err = 0; if (netif_running(dev)) { tg3_netif_stop(tp); @@ -8025,13 +8053,14 @@ static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam if (netif_running(dev)) { tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); - tg3_init_hw(tp, 1); - tg3_netif_start(tp); + err = tg3_restart_hw(tp, 1); + if (!err) + tg3_netif_start(tp); } tg3_full_unlock(tp); - return 0; + return err; } static u32 tg3_get_rx_csum(struct net_device *dev) @@ -8666,7 +8695,9 @@ static int tg3_test_loopback(struct tg3 *tp) if (!netif_running(tp->dev)) return TG3_LOOPBACK_FAILED; - tg3_reset_hw(tp, 1); + err = tg3_reset_hw(tp, 1); + if (err) + return TG3_LOOPBACK_FAILED; if (tg3_run_loopback(tp, TG3_MAC_LOOPBACK)) err |= TG3_MAC_LOOPBACK_FAILED; @@ -8740,8 +8771,8 @@ static void tg3_self_test(struct net_device *dev, struct ethtool_test *etest, tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); if (netif_running(dev)) { tp->tg3_flags |= TG3_FLAG_INIT_COMPLETE; - tg3_init_hw(tp, 1); - tg3_netif_start(tp); + if (!tg3_restart_hw(tp, 1)) + tg3_netif_start(tp); } tg3_full_unlock(tp); @@ -11699,7 +11730,8 @@ static int tg3_suspend(struct pci_dev *pdev, pm_message_t state) tg3_full_lock(tp, 0); tp->tg3_flags |= TG3_FLAG_INIT_COMPLETE; - tg3_init_hw(tp, 1); + if (tg3_restart_hw(tp, 1)) + goto out; tp->timer.expires = jiffies + tp->timer_offset; add_timer(&tp->timer); @@ -11707,6 +11739,7 @@ static int tg3_suspend(struct pci_dev *pdev, pm_message_t state) netif_device_attach(dev); tg3_netif_start(tp); +out: tg3_full_unlock(tp); } @@ -11733,16 +11766,19 @@ static int tg3_resume(struct pci_dev *pdev) tg3_full_lock(tp, 0); tp->tg3_flags |= TG3_FLAG_INIT_COMPLETE; - tg3_init_hw(tp, 1); + err = tg3_restart_hw(tp, 1); + if (err) + goto out; tp->timer.expires = jiffies + tp->timer_offset; add_timer(&tp->timer); tg3_netif_start(tp); +out: tg3_full_unlock(tp); - return 0; + return err; } static struct pci_driver tg3_driver = { From 32d8c5724b7b05c7d8f7386c49432104cc222e32 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 25 Jul 2006 16:38:29 -0700 Subject: [PATCH 38/67] [TG3]: Handle tg3_init_rings() failures Handle dev_alloc_skb() failures when initializing the RX rings. Without proper handling, the driver will crash when using a partial ring. Thanks to Stephane Doyon for reporting the bug and providing the initial patch. Howie Xu also reported the same issue. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/tg3.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 1253cec6ebdc..d66b06f2b865 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -4258,7 +4258,7 @@ static void tg3_free_rings(struct tg3 *tp) * end up in the driver. tp->{tx,}lock are held and thus * we may not sleep. */ -static void tg3_init_rings(struct tg3 *tp) +static int tg3_init_rings(struct tg3 *tp) { u32 i; @@ -4307,18 +4307,38 @@ static void tg3_init_rings(struct tg3 *tp) /* Now allocate fresh SKBs for each rx ring. */ for (i = 0; i < tp->rx_pending; i++) { - if (tg3_alloc_rx_skb(tp, RXD_OPAQUE_RING_STD, - -1, i) < 0) + if (tg3_alloc_rx_skb(tp, RXD_OPAQUE_RING_STD, -1, i) < 0) { + printk(KERN_WARNING PFX + "%s: Using a smaller RX standard ring, " + "only %d out of %d buffers were allocated " + "successfully.\n", + tp->dev->name, i, tp->rx_pending); + if (i == 0) + return -ENOMEM; + tp->rx_pending = i; break; + } } if (tp->tg3_flags & TG3_FLAG_JUMBO_RING_ENABLE) { for (i = 0; i < tp->rx_jumbo_pending; i++) { if (tg3_alloc_rx_skb(tp, RXD_OPAQUE_RING_JUMBO, - -1, i) < 0) + -1, i) < 0) { + printk(KERN_WARNING PFX + "%s: Using a smaller RX jumbo ring, " + "only %d out of %d buffers were " + "allocated successfully.\n", + tp->dev->name, i, tp->rx_jumbo_pending); + if (i == 0) { + tg3_free_rings(tp); + return -ENOMEM; + } + tp->rx_jumbo_pending = i; break; + } } } + return 0; } /* @@ -5969,7 +5989,9 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy) * can only do this after the hardware has been * successfully reset. */ - tg3_init_rings(tp); + err = tg3_init_rings(tp); + if (err) + return err; /* This value is determined during the probe time DMA * engine test, tg3_test_dma. From b6e77a5346d8a739227ed73c2269966a4fd652b4 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 25 Jul 2006 16:39:12 -0700 Subject: [PATCH 39/67] [TG3]: Update version and reldate Update version to 3.63. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/tg3.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index d66b06f2b865..1b8138f641e3 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -68,8 +68,8 @@ #define DRV_MODULE_NAME "tg3" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "3.62" -#define DRV_MODULE_RELDATE "June 30, 2006" +#define DRV_MODULE_VERSION "3.63" +#define DRV_MODULE_RELDATE "July 25, 2006" #define TG3_DEF_MAC_MODE 0 #define TG3_DEF_RX_MODE 0 From 722874909271a807b243a797c2958e0a12992964 Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov Date: Tue, 25 Jul 2006 16:45:12 -0700 Subject: [PATCH 40/67] [IPV4] ipmr: ip multicast route bug fix. IP multicast route code was reusing an skb which causes use after free and double free. From: Alexey Kuznetsov Note, it is real skb_clone(), not alloc_skb(). Equeued skb contains the whole half-prepared netlink message plus room for the rest. It could be also skb_copy(), if we want to be puristic about mangling cloned data, but original copy is really not going to be used. Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9ccacf57f08b..85893eef6b16 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1578,6 +1578,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); if (cache==NULL) { + struct sk_buff *skb2; struct net_device *dev; int vif; @@ -1591,12 +1592,18 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) read_unlock(&mrt_lock); return -ENODEV; } - skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); - skb->nh.iph->ihl = sizeof(struct iphdr)>>2; - skb->nh.iph->saddr = rt->rt_src; - skb->nh.iph->daddr = rt->rt_dst; - skb->nh.iph->version = 0; - err = ipmr_cache_unresolved(vif, skb); + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) { + read_unlock(&mrt_lock); + return -ENOMEM; + } + + skb2->nh.raw = skb_push(skb2, sizeof(struct iphdr)); + skb2->nh.iph->ihl = sizeof(struct iphdr)>>2; + skb2->nh.iph->saddr = rt->rt_src; + skb2->nh.iph->daddr = rt->rt_dst; + skb2->nh.iph->version = 0; + err = ipmr_cache_unresolved(vif, skb2); read_unlock(&mrt_lock); return err; } From f59fc7f30b710d45aadf715460b3e60dbe9d3418 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 25 Jul 2006 17:05:35 -0700 Subject: [PATCH 41/67] [IPV4/IPV6]: Setting 0 for unused port field in RAW IP recvmsg(). From: Tetsuo Handa from-linux-kernel@i-love.sakura.ne.jp The recvmsg() for raw socket seems to return random u16 value from the kernel stack memory since port field is not initialized. But I'm not sure this patch is correct. Does raw socket return any information stored in port field? [ BSD defines RAW IP recvmsg to return a sin_port value of zero. This is described in Steven's TCP/IP Illustrated Volume 2 on page 1055, which is discussing the BSD rip_input() implementation. ] Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv4/raw.c | 1 + net/ipv6/raw.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index bd221ec3f81e..62b2762a2420 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -609,6 +609,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = skb->nh.iph->saddr; + sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); } if (inet->cmsg_flags) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index fa1ce0ae123e..d57e61ce4a7d 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -411,6 +411,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, /* Copy the address. */ if (sin6) { sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; ipv6_addr_copy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr); sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = 0; From 153d7f3fcae7ed4e19328549aa9467acdfbced10 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 26 Jul 2006 15:40:07 +0200 Subject: [PATCH 42/67] [PATCH] Reorganize the cpufreq cpu hotplug locking to not be totally bizare The patch below moves the cpu hotplugging higher up in the cpufreq layering; this is needed to avoid recursive taking of the cpu hotplug lock and to otherwise detangle the mess. The new rules are: 1. you must do lock_cpu_hotplug() around the following functions: __cpufreq_driver_target __cpufreq_governor (for CPUFREQ_GOV_LIMITS operation only) __cpufreq_set_policy 2. governer methods (.governer) must NOT take the lock_cpu_hotplug() lock in any way; they are called with the lock taken already 3. if your governer spawns a thread that does things, like calling __cpufreq_driver_target, your thread must honor rule #1. 4. the policy lock and other cpufreq internal locks nest within the lock_cpu_hotplug() lock. I'm not entirely happy about how the __cpufreq_governor rule ended up (conditional locking rule depending on the argument) but basically all callers pass this as a constant so it's not too horrible. The patch also removes the cpufreq_governor() function since during the locking audit it turned out to be entirely unused (so no need to fix it) The patch works on my testbox, but it could use more testing (otoh... it can't be much worse than the current code) Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- drivers/cpufreq/cpufreq.c | 40 ++++++++++++-------------- drivers/cpufreq/cpufreq_conservative.c | 2 -- drivers/cpufreq/cpufreq_ondemand.c | 4 +-- drivers/cpufreq/cpufreq_userspace.c | 3 ++ include/linux/cpufreq.h | 3 -- 5 files changed, 23 insertions(+), 29 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 8d328186f774..bc1088d9b379 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -364,10 +364,12 @@ static ssize_t store_##file_name \ if (ret != 1) \ return -EINVAL; \ \ + lock_cpu_hotplug(); \ mutex_lock(&policy->lock); \ ret = __cpufreq_set_policy(policy, &new_policy); \ policy->user_policy.object = policy->object; \ mutex_unlock(&policy->lock); \ + unlock_cpu_hotplug(); \ \ return ret ? ret : count; \ } @@ -1197,20 +1199,18 @@ EXPORT_SYMBOL(cpufreq_unregister_notifier); *********************************************************************/ +/* Must be called with lock_cpu_hotplug held */ int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { int retval = -EINVAL; - lock_cpu_hotplug(); dprintk("target for CPU %u: %u kHz, relation %u\n", policy->cpu, target_freq, relation); if (cpu_online(policy->cpu) && cpufreq_driver->target) retval = cpufreq_driver->target(policy, target_freq, relation); - unlock_cpu_hotplug(); - return retval; } EXPORT_SYMBOL_GPL(__cpufreq_driver_target); @@ -1225,17 +1225,23 @@ int cpufreq_driver_target(struct cpufreq_policy *policy, if (!policy) return -EINVAL; + lock_cpu_hotplug(); mutex_lock(&policy->lock); ret = __cpufreq_driver_target(policy, target_freq, relation); mutex_unlock(&policy->lock); + unlock_cpu_hotplug(); cpufreq_cpu_put(policy); return ret; } EXPORT_SYMBOL_GPL(cpufreq_driver_target); +/* + * Locking: Must be called with the lock_cpu_hotplug() lock held + * when "event" is CPUFREQ_GOV_LIMITS + */ static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) { @@ -1257,24 +1263,6 @@ static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) } -int cpufreq_governor(unsigned int cpu, unsigned int event) -{ - int ret = 0; - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - - if (!policy) - return -EINVAL; - - mutex_lock(&policy->lock); - ret = __cpufreq_governor(policy, event); - mutex_unlock(&policy->lock); - - cpufreq_cpu_put(policy); - return ret; -} -EXPORT_SYMBOL_GPL(cpufreq_governor); - - int cpufreq_register_governor(struct cpufreq_governor *governor) { struct cpufreq_governor *t; @@ -1342,6 +1330,9 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu) EXPORT_SYMBOL(cpufreq_get_policy); +/* + * Locking: Must be called with the lock_cpu_hotplug() lock held + */ static int __cpufreq_set_policy(struct cpufreq_policy *data, struct cpufreq_policy *policy) { int ret = 0; @@ -1436,6 +1427,8 @@ int cpufreq_set_policy(struct cpufreq_policy *policy) if (!data) return -EINVAL; + lock_cpu_hotplug(); + /* lock this CPU */ mutex_lock(&data->lock); @@ -1446,6 +1439,8 @@ int cpufreq_set_policy(struct cpufreq_policy *policy) data->user_policy.governor = data->governor; mutex_unlock(&data->lock); + + unlock_cpu_hotplug(); cpufreq_cpu_put(data); return ret; @@ -1469,6 +1464,7 @@ int cpufreq_update_policy(unsigned int cpu) if (!data) return -ENODEV; + lock_cpu_hotplug(); mutex_lock(&data->lock); dprintk("updating policy for CPU %u\n", cpu); @@ -1494,7 +1490,7 @@ int cpufreq_update_policy(unsigned int cpu) ret = __cpufreq_set_policy(data, &policy); mutex_unlock(&data->lock); - + unlock_cpu_hotplug(); cpufreq_cpu_put(data); return ret; } diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index b3ebc8f01975..c4c578defabf 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -525,7 +525,6 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, break; case CPUFREQ_GOV_LIMITS: - lock_cpu_hotplug(); mutex_lock(&dbs_mutex); if (policy->max < this_dbs_info->cur_policy->cur) __cpufreq_driver_target( @@ -536,7 +535,6 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, this_dbs_info->cur_policy, policy->min, CPUFREQ_RELATION_L); mutex_unlock(&dbs_mutex); - unlock_cpu_hotplug(); break; } return 0; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 178f0c547eb7..52cf1f021825 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -309,7 +309,9 @@ static void do_dbs_timer(void *data) if (!dbs_info->enable) return; + lock_cpu_hotplug(); dbs_check_cpu(dbs_info); + unlock_cpu_hotplug(); queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, usecs_to_jiffies(dbs_tuners_ins.sampling_rate)); } @@ -412,7 +414,6 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, break; case CPUFREQ_GOV_LIMITS: - lock_cpu_hotplug(); mutex_lock(&dbs_mutex); if (policy->max < this_dbs_info->cur_policy->cur) __cpufreq_driver_target(this_dbs_info->cur_policy, @@ -423,7 +424,6 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, policy->min, CPUFREQ_RELATION_L); mutex_unlock(&dbs_mutex); - unlock_cpu_hotplug(); break; } return 0; diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c index 44ae5e5b94cf..a06c204589cd 100644 --- a/drivers/cpufreq/cpufreq_userspace.c +++ b/drivers/cpufreq/cpufreq_userspace.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -70,6 +71,7 @@ static int cpufreq_set(unsigned int freq, struct cpufreq_policy *policy) dprintk("cpufreq_set for cpu %u, freq %u kHz\n", policy->cpu, freq); + lock_cpu_hotplug(); mutex_lock(&userspace_mutex); if (!cpu_is_managed[policy->cpu]) goto err; @@ -92,6 +94,7 @@ static int cpufreq_set(unsigned int freq, struct cpufreq_policy *policy) err: mutex_unlock(&userspace_mutex); + unlock_cpu_hotplug(); return ret; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 35e137636b0b..4ea39fee99c7 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -172,9 +172,6 @@ extern int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int relation); -/* pass an event to the cpufreq governor */ -int cpufreq_governor(unsigned int cpu, unsigned int event); - int cpufreq_register_governor(struct cpufreq_governor *governor); void cpufreq_unregister_governor(struct cpufreq_governor *governor); From 64821324ca49f24be1a66f2f432108f96a24e596 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jul 2006 09:53:23 +0200 Subject: [PATCH 43/67] [PATCH] fix compile regression for a few scsi drivers This fixes three drivers to compile again after my patch that removes the data_cmnd member from struct scsi_cmnd. The fas216 change is trivial, it should have been using ->cmnd all the time. NCR53C9 (which seem to be mostly duplicate driver with esp.c!) is doing something odd, it should only have looked at ->cmnd before not the saved copy that is kept for the error handlers sake. Note that it really should deal with the sync setting themselves but use the generic domain validation code that get this right - but that's for later let's push this simple compile fix for now. And sorry for the late fix for this, I have been busy with OLS and associated activities last week. Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- drivers/scsi/NCR53C9x.c | 16 +++++----------- drivers/scsi/arm/fas216.c | 2 +- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/NCR53C9x.c b/drivers/scsi/NCR53C9x.c index 085db4826e0e..bdc6bb262bce 100644 --- a/drivers/scsi/NCR53C9x.c +++ b/drivers/scsi/NCR53C9x.c @@ -2152,29 +2152,23 @@ static int esp_do_data_finale(struct NCR_ESP *esp, */ static int esp_should_clear_sync(Scsi_Cmnd *sp) { - unchar cmd1 = sp->cmnd[0]; - unchar cmd2 = sp->data_cmnd[0]; + unchar cmd = sp->cmnd[0]; /* These cases are for spinning up a disk and * waiting for that spinup to complete. */ - if(cmd1 == START_STOP || - cmd2 == START_STOP) + if(cmd == START_STOP) return 0; - if(cmd1 == TEST_UNIT_READY || - cmd2 == TEST_UNIT_READY) + if(cmd == TEST_UNIT_READY) return 0; /* One more special case for SCSI tape drives, * this is what is used to probe the device for * completion of a rewind or tape load operation. */ - if(sp->device->type == TYPE_TAPE) { - if(cmd1 == MODE_SENSE || - cmd2 == MODE_SENSE) - return 0; - } + if(sp->device->type == TYPE_TAPE && cmd == MODE_SENSE) + return 0; return 1; } diff --git a/drivers/scsi/arm/fas216.c b/drivers/scsi/arm/fas216.c index 3e1053f111dc..4cf7afc31cc7 100644 --- a/drivers/scsi/arm/fas216.c +++ b/drivers/scsi/arm/fas216.c @@ -2427,7 +2427,7 @@ int fas216_eh_abort(Scsi_Cmnd *SCpnt) info->stats.aborts += 1; printk(KERN_WARNING "scsi%d: abort command ", info->host->host_no); - __scsi_print_command(SCpnt->data_cmnd); + __scsi_print_command(SCpnt->cmnd); print_debug_list(); fas216_dumpstate(info); From ba4ba8a69dcb446450b5ddeca48a7bd15783f4c2 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Thu, 27 Jul 2006 14:00:23 +0200 Subject: [PATCH 44/67] [S390] permanent subchannel busy conditions may cause I/O stall In special conditions where a subchannel rejects the HALT I/O- instruction with a busy indication (cc 2), I/O may stall. I/O request termination logic retries HALT I/O indefinitely because it expects HALT I/O to alter the subchannel status which is not true when cc 2 is returned. In case of a busy indication, try CLEAR I/O instruction immediately. Signed-off-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/device_fsm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index ac6e0c7e43d9..7a39e0b0386c 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -152,7 +152,8 @@ ccw_device_cancel_halt_clear(struct ccw_device *cdev) if (cdev->private->iretry) { cdev->private->iretry--; ret = cio_halt(sch); - return (ret == 0) ? -EBUSY : ret; + if (ret != -EBUSY) + return (ret == 0) ? -EBUSY : ret; } /* halt io unsuccessful. */ cdev->private->iretry = 255; /* 255 clear retries. */ From 17088229846c078aa936ca64912ab221d083aca1 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 27 Jul 2006 14:00:33 +0200 Subject: [PATCH 45/67] [S390] duplicate ccw devices in ccwgroup. Fail to create a ccwgroup device if a ccw device is passed in twice. Signed-off-by: Cornelia Huck Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/ccwgroup.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c index f26a2ee3aad8..3cba6c9fab11 100644 --- a/drivers/s390/cio/ccwgroup.c +++ b/drivers/s390/cio/ccwgroup.c @@ -152,7 +152,6 @@ ccwgroup_create(struct device *root, struct ccwgroup_device *gdev; int i; int rc; - int del_drvdata; if (argc > 256) /* disallow dumb users */ return -EINVAL; @@ -163,7 +162,6 @@ ccwgroup_create(struct device *root, atomic_set(&gdev->onoff, 0); - del_drvdata = 0; for (i = 0; i < argc; i++) { gdev->cdev[i] = get_ccwdev_by_busid(cdrv, argv[i]); @@ -180,10 +178,8 @@ ccwgroup_create(struct device *root, rc = -EINVAL; goto free_dev; } - } - for (i = 0; i < argc; i++) gdev->cdev[i]->dev.driver_data = gdev; - del_drvdata = 1; + } gdev->creator_id = creator_id; gdev->count = argc; @@ -226,9 +222,9 @@ ccwgroup_create(struct device *root, free_dev: for (i = 0; i < argc; i++) if (gdev->cdev[i]) { - put_device(&gdev->cdev[i]->dev); - if (del_drvdata) + if (gdev->cdev[i]->dev.driver_data == gdev) gdev->cdev[i]->dev.driver_data = NULL; + put_device(&gdev->cdev[i]->dev); } kfree(gdev); return rc; From 468310a8a7af4f3933ade2700f01d493fa1a9754 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 27 Jul 2006 14:04:57 +0200 Subject: [PATCH 46/67] [S390] update default configuration Signed-off-by: Martin Schwidefsky --- arch/s390/defconfig | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/arch/s390/defconfig b/arch/s390/defconfig index f4dfc10026d2..f1d4591eddbb 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -1,13 +1,16 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.17-rc1 -# Mon Apr 3 14:34:15 2006 +# Linux kernel version: 2.6.18-rc2 +# Thu Jul 27 13:51:07 2006 # CONFIG_MMU=y +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_HWEIGHT=y CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_S390=y +CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # # Code maturity level options @@ -25,6 +28,7 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set +# CONFIG_TASKSTATS is not set CONFIG_SYSCTL=y CONFIG_AUDIT=y # CONFIG_AUDITSYSCALL is not set @@ -43,10 +47,12 @@ CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_BASE_FULL=y +CONFIG_RT_MUTEXES=y CONFIG_FUTEX=y CONFIG_EPOLL=y CONFIG_SHMEM=y CONFIG_SLAB=y +CONFIG_VM_EVENT_COUNTERS=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 # CONFIG_SLOB is not set @@ -94,7 +100,6 @@ CONFIG_HOTPLUG_CPU=y CONFIG_DEFAULT_MIGRATION_COST=1000000 CONFIG_COMPAT=y CONFIG_SYSVIPC_COMPAT=y -CONFIG_BINFMT_ELF32=y # # Code generation options @@ -115,6 +120,7 @@ CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_RESOURCES_64BIT=y # # I/O subsystem configuration @@ -142,6 +148,7 @@ CONFIG_VIRT_CPU_ACCOUNTING=y # CONFIG_APPLDATA_BASE is not set CONFIG_NO_IDLE_HZ=y CONFIG_NO_IDLE_HZ_INIT=y +CONFIG_S390_HYPFS_FS=y CONFIG_KEXEC=y # @@ -174,6 +181,8 @@ CONFIG_IP_FIB_HASH=y # CONFIG_INET_IPCOMP is not set # CONFIG_INET_XFRM_TUNNEL is not set # CONFIG_INET_TUNNEL is not set +CONFIG_INET_XFRM_MODE_TRANSPORT=y +CONFIG_INET_XFRM_MODE_TUNNEL=y CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set @@ -186,7 +195,10 @@ CONFIG_IPV6=y # CONFIG_INET6_IPCOMP is not set # CONFIG_INET6_XFRM_TUNNEL is not set # CONFIG_INET6_TUNNEL is not set +CONFIG_INET6_XFRM_MODE_TRANSPORT=y +CONFIG_INET6_XFRM_MODE_TUNNEL=y # CONFIG_IPV6_TUNNEL is not set +# CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set # @@ -263,6 +275,7 @@ CONFIG_NET_ESTIMATOR=y # Network testing # # CONFIG_NET_PKTGEN is not set +# CONFIG_NET_TCPPROBE is not set # CONFIG_HAMRADIO is not set # CONFIG_IRDA is not set # CONFIG_BT is not set @@ -276,6 +289,7 @@ CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y # CONFIG_FW_LOADER is not set # CONFIG_DEBUG_DRIVER is not set +CONFIG_SYS_HYPERVISOR=y # # Connector - unified userspace <-> kernelspace linker @@ -334,6 +348,7 @@ CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=4096 +CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 CONFIG_BLK_DEV_INITRD=y # CONFIG_CDROM_PKTCDVD is not set @@ -359,9 +374,7 @@ CONFIG_MD_LINEAR=m CONFIG_MD_RAID0=m CONFIG_MD_RAID1=m # CONFIG_MD_RAID10 is not set -CONFIG_MD_RAID5=m -# CONFIG_MD_RAID5_RESHAPE is not set -# CONFIG_MD_RAID6 is not set +# CONFIG_MD_RAID456 is not set CONFIG_MD_MULTIPATH=m # CONFIG_MD_FAULTY is not set CONFIG_BLK_DEV_DM=y @@ -419,7 +432,8 @@ CONFIG_S390_TAPE_34XX=m # # Cryptographic devices # -CONFIG_Z90CRYPT=m +CONFIG_ZCRYPT=m +# CONFIG_ZCRYPT_MONOLITHIC is not set # # Network device support @@ -509,6 +523,7 @@ CONFIG_FS_MBCACHE=y # CONFIG_MINIX_FS is not set # CONFIG_ROMFS_FS is not set CONFIG_INOTIFY=y +CONFIG_INOTIFY_USER=y # CONFIG_QUOTA is not set CONFIG_DNOTIFY=y # CONFIG_AUTOFS_FS is not set @@ -614,26 +629,36 @@ CONFIG_MSDOS_PARTITION=y # Instrumentation Support # # CONFIG_PROFILING is not set -# CONFIG_STATISTICS is not set +CONFIG_STATISTICS=y +CONFIG_KPROBES=y # # Kernel hacking # +CONFIG_TRACE_IRQFLAGS_SUPPORT=y # CONFIG_PRINTK_TIME is not set CONFIG_MAGIC_SYSRQ=y +# CONFIG_UNUSED_SYMBOLS is not set CONFIG_DEBUG_KERNEL=y CONFIG_LOG_BUF_SHIFT=17 # CONFIG_DETECT_SOFTLOCKUP is not set # CONFIG_SCHEDSTATS is not set # CONFIG_DEBUG_SLAB is not set CONFIG_DEBUG_PREEMPT=y -CONFIG_DEBUG_MUTEXES=y +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_RT_MUTEX_TESTER is not set CONFIG_DEBUG_SPINLOCK=y +CONFIG_DEBUG_MUTEXES=y +# CONFIG_DEBUG_RWSEMS is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_PROVE_LOCKING is not set CONFIG_DEBUG_SPINLOCK_SLEEP=y +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_INFO is not set CONFIG_DEBUG_FS=y # CONFIG_DEBUG_VM is not set +# CONFIG_FRAME_POINTER is not set # CONFIG_UNWIND_INFO is not set CONFIG_FORCED_INLINING=y # CONFIG_RCU_TORTURE_TEST is not set @@ -688,3 +713,4 @@ CONFIG_CRYPTO=y # CONFIG_CRC16 is not set CONFIG_CRC32=m # CONFIG_LIBCRC32C is not set +CONFIG_PLIST=y From 92f282988b4ce3967ee8399f7d1184ebfa04e48b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 27 Jul 2006 16:49:21 -0700 Subject: [PATCH 47/67] [SPARC64]: Fix quad-float multiply emulation. Something is wrong with the 3-multiply (vs. 4-multiply) optimized version of _FP_MUL_MEAT_2_*(), so just use the slower version which actually computes correct values. Noticed by Rene Rebe Signed-off-by: David S. Miller --- include/asm-sparc64/sfp-machine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-sparc64/sfp-machine.h b/include/asm-sparc64/sfp-machine.h index 5015bb8d6c32..89d42431efb5 100644 --- a/include/asm-sparc64/sfp-machine.h +++ b/include/asm-sparc64/sfp-machine.h @@ -34,7 +34,7 @@ #define _FP_MUL_MEAT_D(R,X,Y) \ _FP_MUL_MEAT_1_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm) #define _FP_MUL_MEAT_Q(R,X,Y) \ - _FP_MUL_MEAT_2_wide_3mul(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm) + _FP_MUL_MEAT_2_wide(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm) #define _FP_DIV_MEAT_S(R,X,Y) _FP_DIV_MEAT_1_imm(S,R,X,Y,_FP_DIV_HELP_imm) #define _FP_DIV_MEAT_D(R,X,Y) _FP_DIV_MEAT_1_udiv_norm(D,R,X,Y) From b8cfac4c2f3d12d0f4cbe6f992d945f2fdfc098d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 27 Jul 2006 17:57:32 -0700 Subject: [PATCH 48/67] [SPARC64]: Fix typo in pgprot_noncached(). The sun4v code sequence was or'ing in the sun4u pte bits by mistake. Signed-off-by: David S. Miller --- include/asm-sparc64/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-sparc64/pgtable.h b/include/asm-sparc64/pgtable.h index 03f5bc9b6bec..1ba19eb34ce3 100644 --- a/include/asm-sparc64/pgtable.h +++ b/include/asm-sparc64/pgtable.h @@ -339,7 +339,7 @@ static inline pgprot_t pgprot_noncached(pgprot_t prot) " .section .sun4v_2insn_patch, \"ax\"\n" " .word 661b\n" " andn %0, %4, %0\n" - " or %0, %3, %0\n" + " or %0, %5, %0\n" " .previous\n" : "=r" (val) : "0" (val), "i" (_PAGE_CP_4U | _PAGE_CV_4U), "i" (_PAGE_E_4U), From 96ba989d22de779ca19ca214e2b2e53a4ca86b7b Mon Sep 17 00:00:00 2001 From: Bob Breuer Date: Thu, 27 Jul 2006 22:08:01 -0700 Subject: [PATCH 49/67] [SPARC]: Defer clock_probe to fs_initcall() From: Bob Breuer That way all the of_driver bits will be ready. Signed-off-by: David S. Miller --- arch/sparc/kernel/time.c | 74 +++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c index 04eb1eab6e3e..845081b01267 100644 --- a/arch/sparc/kernel/time.c +++ b/arch/sparc/kernel/time.c @@ -225,6 +225,32 @@ static __inline__ int has_low_battery(void) return (data1 == data2); /* Was the write blocked? */ } +static void __init mostek_set_system_time(void) +{ + unsigned int year, mon, day, hour, min, sec; + struct mostek48t02 *mregs; + + mregs = (struct mostek48t02 *)mstk48t02_regs; + if(!mregs) { + prom_printf("Something wrong, clock regs not mapped yet.\n"); + prom_halt(); + } + spin_lock_irq(&mostek_lock); + mregs->creg |= MSTK_CREG_READ; + sec = MSTK_REG_SEC(mregs); + min = MSTK_REG_MIN(mregs); + hour = MSTK_REG_HOUR(mregs); + day = MSTK_REG_DOM(mregs); + mon = MSTK_REG_MONTH(mregs); + year = MSTK_CVT_YEAR( MSTK_REG_YEAR(mregs) ); + xtime.tv_sec = mktime(year, mon, day, hour, min, sec); + xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + mregs->creg &= ~MSTK_CREG_READ; + spin_unlock_irq(&mostek_lock); +} + /* Probe for the real time clock chip on Sun4 */ static __inline__ void sun4_clock_probe(void) { @@ -273,6 +299,7 @@ static __inline__ void sun4_clock_probe(void) #endif } +#ifndef CONFIG_SUN4 static int __devinit clock_probe(struct of_device *op, const struct of_device_id *match) { struct device_node *dp = op->node; @@ -307,6 +334,8 @@ static int __devinit clock_probe(struct of_device *op, const struct of_device_id if (mostek_read(mstk48t02_regs + MOSTEK_SEC) & MSTK_STOP) kick_start_clock(); + mostek_set_system_time(); + return 0; } @@ -325,56 +354,37 @@ static struct of_platform_driver clock_driver = { /* Probe for the mostek real time clock chip. */ -static void clock_init(void) +static int __init clock_init(void) { - of_register_driver(&clock_driver, &of_bus_type); + return of_register_driver(&clock_driver, &of_bus_type); } +/* Must be after subsys_initcall() so that busses are probed. Must + * be before device_initcall() because things like the RTC driver + * need to see the clock registers. + */ +fs_initcall(clock_init); +#endif /* !CONFIG_SUN4 */ + void __init sbus_time_init(void) { - unsigned int year, mon, day, hour, min, sec; - struct mostek48t02 *mregs; - -#ifdef CONFIG_SUN4 - int temp; - struct intersil *iregs; -#endif BTFIXUPSET_CALL(bus_do_settimeofday, sbus_do_settimeofday, BTFIXUPCALL_NORM); btfixup(); if (ARCH_SUN4) sun4_clock_probe(); - else - clock_init(); sparc_init_timers(timer_interrupt); #ifdef CONFIG_SUN4 if(idprom->id_machtype == (SM_SUN4 | SM_4_330)) { -#endif - mregs = (struct mostek48t02 *)mstk48t02_regs; - if(!mregs) { - prom_printf("Something wrong, clock regs not mapped yet.\n"); - prom_halt(); - } - spin_lock_irq(&mostek_lock); - mregs->creg |= MSTK_CREG_READ; - sec = MSTK_REG_SEC(mregs); - min = MSTK_REG_MIN(mregs); - hour = MSTK_REG_HOUR(mregs); - day = MSTK_REG_DOM(mregs); - mon = MSTK_REG_MONTH(mregs); - year = MSTK_CVT_YEAR( MSTK_REG_YEAR(mregs) ); - xtime.tv_sec = mktime(year, mon, day, hour, min, sec); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - mregs->creg &= ~MSTK_CREG_READ; - spin_unlock_irq(&mostek_lock); -#ifdef CONFIG_SUN4 + mostek_set_system_time(); } else if(idprom->id_machtype == (SM_SUN4 | SM_4_260) ) { /* initialise the intersil on sun4 */ + unsigned int year, mon, day, hour, min, sec; + int temp; + struct intersil *iregs; iregs=intersil_clock; if(!iregs) { From 361934849e9c0418950bedf667732f36337d88b9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 28 Jul 2006 08:54:59 +0200 Subject: [PATCH 50/67] [PATCH] ide: option to disable cache flushes for buggy drives Some drives claim they support cache flushing, but get seriously confused if you try. Add this option to be able to boot with barriers enabled by default. Signed-off-by: Jens Axboe --- drivers/ide/ide-disk.c | 2 +- drivers/ide/ide.c | 5 ++++- include/linux/ide.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index f712e4cfd9dc..7cf3eb023521 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -776,7 +776,7 @@ static void update_ordered(ide_drive_t *drive) * not available so we don't need to recheck that. */ capacity = idedisk_capacity(drive); - barrier = ide_id_has_flush_cache(id) && + barrier = ide_id_has_flush_cache(id) && !drive->noflush && (drive->addressing == 0 || capacity <= (1ULL << 28) || ide_id_has_flush_cache_ext(id)); diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index 05fbd9298db7..defd4b4bd374 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -1539,7 +1539,7 @@ static int __init ide_setup(char *s) const char *hd_words[] = { "none", "noprobe", "nowerr", "cdrom", "serialize", "autotune", "noautotune", "minus8", "swapdata", "bswap", - "minus11", "remap", "remap63", "scsi", NULL }; + "noflush", "remap", "remap63", "scsi", NULL }; unit = s[2] - 'a'; hw = unit / MAX_DRIVES; unit = unit % MAX_DRIVES; @@ -1578,6 +1578,9 @@ static int __init ide_setup(char *s) case -10: /* "bswap" */ drive->bswap = 1; goto done; + case -11: /* noflush */ + drive->noflush = 1; + goto done; case -12: /* "remap" */ drive->remap_0_to_1 = 1; goto done; diff --git a/include/linux/ide.h b/include/linux/ide.h index dc7abef10965..99620451d958 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -571,6 +571,7 @@ typedef struct ide_drive_s { u8 waiting_for_dma; /* dma currently in progress */ u8 unmask; /* okay to unmask other irqs */ u8 bswap; /* byte swap data */ + u8 noflush; /* don't attempt flushes */ u8 dsc_overlap; /* DSC overlap */ u8 nice1; /* give potential excess bandwidth */ From 0a8348d08677ad77ee353f96eb8745c693a05a13 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 28 Jul 2006 08:58:26 +0200 Subject: [PATCH 51/67] [PATCH] ide: if the id fields looks screwy, disable DMA It's the safer choice. Originally due to a bug in itx821x, but a generally sound thing to do. Signed-off-by: Jens Axboe --- drivers/ide/ide-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 98918fb6b2ce..7c3a13e1cf64 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -750,7 +750,7 @@ void ide_dma_verbose(ide_drive_t *drive) goto bug_dma_off; printk(", DMA"); } else if (id->field_valid & 1) { - printk(", BUG"); + goto bug_dma_off; } return; bug_dma_off: From 71ef51cc1756d1c56b57c70e7cc27a3559c81ee6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 28 Jul 2006 09:02:17 +0200 Subject: [PATCH 52/67] [PATCH] it821x: fix ide dma setup bug Only enable dma for a valid speed setting. Signed-off-by: Jens Axboe --- drivers/ide/pci/it821x.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/ide/pci/it821x.c b/drivers/ide/pci/it821x.c index 3cb04424d351..e9bad185968a 100644 --- a/drivers/ide/pci/it821x.c +++ b/drivers/ide/pci/it821x.c @@ -498,9 +498,14 @@ static int config_chipset_for_dma (ide_drive_t *drive) { u8 speed = ide_dma_speed(drive, it821x_ratemask(drive)); - config_it821x_chipset_for_pio(drive, !speed); - it821x_tune_chipset(drive, speed); - return ide_dma_enable(drive); + if (speed) { + config_it821x_chipset_for_pio(drive, 0); + it821x_tune_chipset(drive, speed); + + return ide_dma_enable(drive); + } + + return 0; } /** From a75ad3c27a6ad78c4306cac939938050dcde54f3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 28 Jul 2006 09:04:09 +0200 Subject: [PATCH 53/67] [PATCH] scsi: kill overeager "not-ready" messages HAL and friends have a tendency to trigger this one all the time. It's not really interesting, so kill it. The vendor kernels all do anyways. Signed-off-by: Jens Axboe --- drivers/scsi/scsi_ioctl.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index a89c4115cfba..32293f451669 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -110,11 +110,8 @@ static int ioctl_internal_command(struct scsi_device *sdev, char *cmd, sshdr.asc, sshdr.ascq); break; case NOT_READY: /* This happens if there is no disc in drive */ - if (sdev->removable && (cmd[0] != TEST_UNIT_READY)) { - printk(KERN_INFO "Device not ready. Make sure" - " there is a disc in the drive.\n"); + if (sdev->removable) break; - } case UNIT_ATTENTION: if (sdev->removable) { sdev->changed = 1; From 2a293b7d5aa2f0d1e3d87b642f7ac263c2d664e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Jul 2006 17:04:26 +1000 Subject: [PATCH 54/67] [XFS] All xfs_disk_dquot_t values are (as the name says) disk endian. Before putting them into struct statfs they should be endian-swapped. SGI-PV: 954580 SGI-Modid: xfs-linux-melb:xfs-kern:26550a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/quota/xfs_qm_bhv.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index e95e99f7168f..f137856c3261 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -217,17 +217,24 @@ xfs_qm_statvfs( return 0; dp = &dqp->q_core; - limit = dp->d_blk_softlimit ? dp->d_blk_softlimit : dp->d_blk_hardlimit; + limit = dp->d_blk_softlimit ? + be64_to_cpu(dp->d_blk_softlimit) : + be64_to_cpu(dp->d_blk_hardlimit); if (limit && statp->f_blocks > limit) { statp->f_blocks = limit; - statp->f_bfree = (statp->f_blocks > dp->d_bcount) ? - (statp->f_blocks - dp->d_bcount) : 0; + statp->f_bfree = + (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? + (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; } - limit = dp->d_ino_softlimit ? dp->d_ino_softlimit : dp->d_ino_hardlimit; + + limit = dp->d_ino_softlimit ? + be64_to_cpu(dp->d_ino_softlimit) : + be64_to_cpu(dp->d_ino_hardlimit); if (limit && statp->f_files > limit) { statp->f_files = limit; - statp->f_ffree = (statp->f_files > dp->d_icount) ? - (statp->f_ffree - dp->d_icount) : 0; + statp->f_ffree = + (statp->f_files > be64_to_cpu(dp->d_icount)) ? + (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0; } xfs_qm_dqput(dqp); From f5faad799475c4058416264f672bb33bf8b5ef41 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 28 Jul 2006 17:04:44 +1000 Subject: [PATCH 55/67] [XFS] Fix remount vs no/barrier options by ensuring we clear unwanted flags from iclog buffers before submitting them for writing. SGI-PV: 954772 SGI-Modid: xfs-linux-melb:xfs-kern:26605a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_buf.h | 4 ++-- fs/xfs/xfs_log.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index ceda3a2859d2..7858703ed84c 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -246,8 +246,8 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); #define BUF_BUSY XBF_DONT_BLOCK #define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) -#define XFS_BUF_ZEROFLAGS(bp) \ - ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI)) +#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ + ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) #define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e730328636c3..21ac1a67e3e0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1413,7 +1413,7 @@ xlog_sync(xlog_t *log, ops = iclog->ic_header.h_num_logops; INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops); - bp = iclog->ic_bp; + bp = iclog->ic_bp; ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); XFS_BUF_SET_ADDR(bp, BLOCK_LSN(INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT))); @@ -1430,15 +1430,14 @@ xlog_sync(xlog_t *log, } XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count); XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */ + XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); /* * Do an ordered write for the log block. - * - * It may not be needed to flush the first split block in the log wrap - * case, but do it anyways to be safe -AK + * Its unnecessary to flush the first split block in the log wrap case. */ - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) + if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER)) XFS_BUF_ORDERED(bp); ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); @@ -1460,7 +1459,7 @@ xlog_sync(xlog_t *log, return error; } if (split) { - bp = iclog->ic_log->l_xbuf; + bp = iclog->ic_log->l_xbuf; ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); @@ -1468,6 +1467,7 @@ xlog_sync(xlog_t *log, XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ (__psint_t)count), split); XFS_BUF_SET_FSPRIVATE(bp, iclog); + XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) From b2ea401bac39e75ebb64038609ed22efbc799905 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 28 Jul 2006 17:05:13 +1000 Subject: [PATCH 56/67] [XFS] Fix a barrier related forced shutdown on mounts with quota enabled. SGI-PV: 912426 SGI-Modid: xfs-linux-melb:xfs-kern:26622a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_super.c | 7 +++++++ fs/xfs/xfs_vfsops.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 9bdef9d51900..4754f342a5d3 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -314,6 +314,13 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp) return; } + if (xfs_readonly_buftarg(mp->m_ddev_targp)) { + xfs_fs_cmn_err(CE_NOTE, mp, + "Disabling barriers, underlying device is readonly"); + mp->m_flags &= ~XFS_MOUNT_BARRIER; + return; + } + error = xfs_barrier_test(mp); if (error) { xfs_fs_cmn_err(CE_NOTE, mp, diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 6c96391f3f1a..b427d220a169 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -515,7 +515,7 @@ xfs_mount( if (error) goto error2; - if ((mp->m_flags & XFS_MOUNT_BARRIER) && !(vfsp->vfs_flag & VFS_RDONLY)) + if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_mountfs_check_barriers(mp); error = XFS_IOINIT(vfsp, args, flags); From 41ff715abc49324fb2cb20e66bc4e0290cfdbe51 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 28 Jul 2006 17:05:51 +1000 Subject: [PATCH 57/67] [XFS] Ensure bulkstat from an invalid inode number gets caught always with EINVAL. SGI-PV: 953819 SGI-Modid: xfs-linux-melb:xfs-kern:26629a Signed-off-by: Nathan Scott --- fs/xfs/xfs_inode.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 86c1bf0bba9e..1f8ecff8553a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -334,10 +334,9 @@ xfs_itobp( #if !defined(__KERNEL__) ni = 0; #elif defined(DEBUG) - ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : - (BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog); + ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog; #else /* usual case */ - ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : 1; + ni = 1; #endif for (i = 0; i < ni; i++) { @@ -348,11 +347,15 @@ xfs_itobp( (i << mp->m_sb.sb_inodelog)); di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + if (imap_flags & XFS_IMAP_BULKSTAT) { + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EINVAL); + } #ifdef DEBUG - if (!(imap_flags & XFS_IMAP_BULKSTAT)) - cmn_err(CE_ALERT, + cmn_err(CE_ALERT, "Device %s - bad inode magic/vsn " "daddr %lld #%d (magic=%x)", XFS_BUFTARG_NAME(mp->m_ddev_targp), From 93853fd0d492524e9172297d8e8b8364dc2c4c59 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 28 Jul 2006 01:09:40 -0700 Subject: [PATCH 58/67] [SUNLANCE]: fix compilation on sparc-UP Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- drivers/net/sunlance.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/sunlance.c b/drivers/net/sunlance.c index 1ef9fd39a79a..0e3fdf7c6dd3 100644 --- a/drivers/net/sunlance.c +++ b/drivers/net/sunlance.c @@ -1537,7 +1537,7 @@ static int __init sparc_lance_init(void) { if ((idprom->id_machtype == (SM_SUN4|SM_4_330)) || (idprom->id_machtype == (SM_SUN4|SM_4_470))) { - memset(&sun4_sdev, 0, sizeof(sdev)); + memset(&sun4_sdev, 0, sizeof(struct sbus_dev)); sun4_sdev.reg_addrs[0].phys_addr = sun4_eth_physaddr; sun4_sdev.irqs[0] = 6; return sparc_lance_probe_one(&sun4_sdev, NULL, NULL); @@ -1547,16 +1547,16 @@ static int __init sparc_lance_init(void) static int __exit sunlance_sun4_remove(void) { - struct lance_private *lp = dev_get_drvdata(&sun4_sdev->dev); + struct lance_private *lp = dev_get_drvdata(&sun4_sdev.ofdev.dev); struct net_device *net_dev = lp->dev; unregister_netdevice(net_dev); - lance_free_hwresources(root_lance_dev); + lance_free_hwresources(lp); free_netdev(net_dev); - dev_set_drvdata(&sun4_sdev->dev, NULL); + dev_set_drvdata(&sun4_sdev.ofdev.dev, NULL); return 0; } From facf014792093d95e308b5d6ce9bc86d3c90e5b1 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 25 Jul 2006 16:15:16 -0400 Subject: [PATCH 59/67] [PATCH] i386: switch_to(): misplaced parentheses Recent changes in i386 __switch_to() have a misplaced closing parenthesis causing an unlikely() to terminate early. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Acked-by: Steven Rostedt Signed-off-by: Linus Torvalds --- arch/i386/kernel/process.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 923bb292f47f..8657c739656a 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -690,8 +690,8 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas /* * Now maybe handle debug registers and/or IO bitmaps */ - if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) - || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) + if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW) + || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))) __switch_to_xtra(next_p, tss); disable_tsc(prev_p, next_p); From d5a2601734bcc740ee78dc4cb0c56b5687da7bd9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:42 +0200 Subject: [PATCH 60/67] [PATCH] i386/x86-64: Add user_mode checks to profile_pc for oprofile Fixes a obscure user space triggerable crash during oprofiling. Oprofile calls profile_pc from NMIs even when user_mode(regs) is not true and the program counter is inside the kernel lock section. This opens a race - when a user program jumps to a kernel lock address and a NMI happens before the illegal page fault exception is raised and the program has a unmapped esp or ebp then the kernel could oops. NMIs have a higher priority than exceptions so that could happen. Add user_mode checks to i386/x86-64 profile_pc to prevent that. Cc: John Levon Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/i386/kernel/time.c | 2 +- arch/x86_64/kernel/time.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 8705c0f05788..edd00f6cee37 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -135,7 +135,7 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - if (in_lock_functions(pc)) + if (!user_mode_vm(regs) && in_lock_functions(pc)) return *(unsigned long *)(regs->ebp + 4); return pc; diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index b9ff75992c16..e0341c6808e5 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -193,7 +193,7 @@ unsigned long profile_pc(struct pt_regs *regs) is just accounted to the spinlock function. Better would be to write these functions in assembler again and check exactly. */ - if (in_lock_functions(pc)) { + if (!user_mode(regs) && in_lock_functions(pc)) { char *v = *(char **)regs->rsp; if ((v >= _stext && v <= _etext) || (v >= _sinittext && v <= _einittext) || From 0e92da4acb763272c6060f0b14adc2377b627d07 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:45 +0200 Subject: [PATCH 61/67] [PATCH] x86_64: Don't clobber r8-r11 in int 0x80 handler When int 0x80 is called from long mode r8-r11 would leak out of the kernel (or rather they would be filled with some values from the kernel stack). I don't think it's a security issue because the values come from the fixed stack frame which should be near always user registers from a previous interrupt. Still better fix it. Longer term the register save macros need to be cleaned up to avoid such mistakes in the future. Original analysis from Richard Brunner, fix by me. Cc: Richard.Brunner@amd.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/ia32/ia32entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 9b5bb413a6e9..5d4a7d125ed0 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -103,7 +103,7 @@ ENTRY(ia32_sysenter_target) pushq %rax CFI_ADJUST_CFA_OFFSET 8 cld - SAVE_ARGS 0,0,1 + SAVE_ARGS 0,0,0 /* no need to do an access_ok check here because rbp has been 32bit zero extended */ 1: movl (%rbp),%r9d From a4045dff782a8692637c24a0222120082c887caa Mon Sep 17 00:00:00 2001 From: bibo mao Date: Fri, 28 Jul 2006 14:44:48 +0200 Subject: [PATCH 62/67] [PATCH] x86_64: Enlarge debug stack for nested kprobes In x86_64 platform, INT1 and INT3 trap stack is IST stack called DEBUG_STACK, when INT1/INT3 trap happens, system will switch to DEBUG_STACK by hardware. Current DEBUG_STACK size is 4K, when int1/int3 trap happens, kernel will minus current DEBUG_STACK IST value by 4k. But if int3/int1 trap is nested, it will destroy other vector's IST stack. This patch modifies this, it sets DEBUG_STACK size as 8K and allows two level of nested int1/int3 trap. Kprobe DEBUG_STACK may be nested, because kprobe handler may be probed by other kprobes. Thanks jbeulich for pointing out error in the first patch. [AK: nested kprobes are pretty dubious. Hopefully one nest will be enough. This will cost 8K per CPU (4K more than before)] Signed-off-by: bibo, mao Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- include/asm-x86_64/page.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index f7bf875aae40..10f346165cab 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -19,7 +19,7 @@ #define EXCEPTION_STACK_ORDER 0 #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) -#define DEBUG_STACK_ORDER EXCEPTION_STACK_ORDER +#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) #define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) #define IRQSTACK_ORDER 2 From b13761ecd1d9977d2083da243e051e9f29097aef Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:51 +0200 Subject: [PATCH 63/67] [PATCH] x86_64: Dump leftover backtrace entries when dwarf2 unwinder got stuck The dwarf2 unwinder currently often gets stuck because a lot of assembly code doesn't have proper dwarf2 annotiation yet. This currently often happens with __down. Should fix this by adding proper dwarf2 annotation to all inline assembly. However until that's done we need a quick fix for 2.6.18 to avoid incomplete backtraces. So when this happens dump the rest of the stack with the old unwinder instead of silently not dumping it. There was already a optional "both" mode that dumped both, but that was too ugly. I also clarified the headers for the different backtraces a bit. Also add a clear error message for missing dwarf2 annotation that people can work on. And I removed a dead variable left over from Ingo's changes. Cc: mingo@elte.hu Cc: jbeulich@novell.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index eb39a2775236..f7a9d1421078 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -254,7 +254,6 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s { const unsigned cpu = safe_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; - int i = 11; unsigned used = 0; printk("\nCall Trace:\n"); @@ -275,11 +274,20 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s if (unwind_init_blocked(&info, tsk) == 0) unw_ret = show_trace_unwind(&info, NULL); } - if (unw_ret > 0) { - if (call_trace > 0) + if (unw_ret > 0 && !arch_unw_user_mode(&info)) { +#ifdef CONFIG_STACK_UNWIND + unsigned long rip = info.regs.rip; + print_symbol("DWARF2 unwinder stuck at %s\n", rip); + if (call_trace == 1) { + printk("Leftover inexact backtrace:\n"); + stack = (unsigned long *)info.regs.rsp; + } else if (call_trace > 1) return; - printk("Legacy call trace:"); - i = 18; + else + printk("Full inexact backtrace again:\n"); +#else + printk("Inexact backtrace:\n"); +#endif } } @@ -1118,8 +1126,10 @@ static int __init call_trace_setup(char *s) call_trace = -1; else if (strcmp(s, "both") == 0) call_trace = 0; - else if (strcmp(s, "new") == 0) + else if (strcmp(s, "newfallback") == 0) call_trace = 1; + else if (strcmp(s, "new") == 0) + call_trace = 2; return 1; } __setup("call_trace=", call_trace_setup); From b783fd925cdd56d24d164e5bdcb072f2a67aedf4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:54 +0200 Subject: [PATCH 64/67] [PATCH] x86_64: Document backtracer selection options Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- Documentation/x86_64/boot-options.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 6887d44d2661..6da24e7a56cb 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -238,6 +238,13 @@ Debugging pagefaulttrace Dump all page faults. Only useful for extreme debugging and will create a lot of output. + call_trace=[old|both|newfallback|new] + old: use old inexact backtracer + new: use new exact dwarf2 unwinder + both: print entries from both + newfallback: use new unwinder but fall back to old if it gets + stuck (default) + Misc noreplacement Don't replace instructions with more appropriate ones From c97d20a6c51067a38f53680d9609b4cf2867d077 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:57 +0200 Subject: [PATCH 65/67] [PATCH] i386: Do backtrace fallback too Similar patch to earlier x86-64 patch. When the dwarf2 unwinder fails dump the left over stack with the old unwinder. Also some clarifications in the headers. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/i386/kernel/traps.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 313ac1f7dc5a..3facc8fcb91e 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -187,10 +187,21 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (unwind_init_blocked(&info, task) == 0) unw_ret = show_trace_unwind(&info, log_lvl); } - if (unw_ret > 0) { - if (call_trace > 0) + if (unw_ret > 0 && !arch_unw_user_mode(&info)) { +#ifdef CONFIG_STACK_UNWIND + print_symbol("DWARF2 unwinder stuck at %s\n", + UNW_PC(info.regs)); + if (call_trace == 1) { + printk("Leftover inexact backtrace:\n"); + if (UNW_SP(info.regs)) + stack = (void *)UNW_SP(info.regs); + } else if (call_trace > 1) return; - printk("%sLegacy call trace:\n", log_lvl); + else + printk("Full inexact backtrace again:\n"); +#else + printk("Inexact backtrace:\n"); +#endif } } From 627371d73cdd04ed23fe098755b4f855138ad9e0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 29 Jul 2006 05:16:20 +0200 Subject: [PATCH 66/67] [PATCH] pi-futex: robust-futex exit crash fix Fix pi_state->list handling bugs: list handling mishap, locking error. Plus add more debug checks and fix a few style issues i noticed while debugging this. (reported by Ulrich Drepper and Jakub Jelinek.) Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/futex.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index cf0c8e21d1ab..f59003b1d8f9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -415,15 +415,15 @@ static struct task_struct * futex_find_get_task(pid_t pid) */ void exit_pi_state_list(struct task_struct *curr) { - struct futex_hash_bucket *hb; struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; + struct futex_hash_bucket *hb; union futex_key key; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful - * versus waiters unqueueing themselfs + * versus waiters unqueueing themselves: */ spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { @@ -431,21 +431,24 @@ void exit_pi_state_list(struct task_struct *curr) next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; + hb = hash_futex(&key); spin_unlock_irq(&curr->pi_lock); - hb = hash_futex(&key); spin_lock(&hb->lock); spin_lock_irq(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ if (head->next != next) { spin_unlock(&hb->lock); continue; } - list_del_init(&pi_state->list); - WARN_ON(pi_state->owner != curr); - + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); pi_state->owner = NULL; spin_unlock_irq(&curr->pi_lock); @@ -470,7 +473,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) head = &hb->chain; list_for_each_entry_safe(this, next, head, list) { - if (match_futex (&this->key, &me->key)) { + if (match_futex(&this->key, &me->key)) { /* * Another waiter already exists - bump up * the refcount and return its pi_state: @@ -482,6 +485,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) if (unlikely(!pi_state)) return -EINVAL; + WARN_ON(!atomic_read(&pi_state->refcount)); + atomic_inc(&pi_state->refcount); me->pi_state = pi_state; @@ -510,6 +515,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) pi_state->key = me->key; spin_lock_irq(&p->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); pi_state->owner = p; spin_unlock_irq(&p->pi_lock); @@ -584,9 +590,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (curval != uval) return -EINVAL; - list_del_init(&pi_state->owner->pi_state_list); + spin_lock_irq(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + + spin_lock_irq(&new_owner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; + spin_unlock_irq(&new_owner->pi_lock); + rt_mutex_unlock(&pi_state->pi_mutex); return 0; @@ -1236,6 +1250,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, /* Owner died? */ if (q.pi_state->owner != NULL) { spin_lock_irq(&q.pi_state->owner->pi_lock); + WARN_ON(list_empty(&q.pi_state->list)); list_del_init(&q.pi_state->list); spin_unlock_irq(&q.pi_state->owner->pi_lock); } else @@ -1244,6 +1259,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, q.pi_state->owner = current; spin_lock_irq(¤t->pi_lock); + WARN_ON(!list_empty(&q.pi_state->list)); list_add(&q.pi_state->list, ¤t->pi_state_list); spin_unlock_irq(¤t->pi_lock); From e3f2ddeac718c768fdac4b7fe69d465172f788a8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 29 Jul 2006 05:17:57 +0200 Subject: [PATCH 67/67] [PATCH] pi-futex: robust-futex exit Fix robust PI-futexes to be properly unlocked on unexpected exit. For this to work the kernel has to know whether a futex is a PI or a non-PI one, because the semantics are different. Since the space in relevant glibc data structures is extremely scarce, the best solution is to encode the 'PI' information in bit 0 of the robust list pointer. Existing (non-PI) glibc robust futexes have this bit always zero, so the ABI is kept. New glibc with PI-robust-futexes will set this bit. Further fixes from Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Ulrich Drepper Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- include/linux/futex.h | 3 +- kernel/futex.c | 91 +++++++++++++++++++++++++++++-------------- kernel/futex_compat.c | 34 +++++++++++----- 3 files changed, 89 insertions(+), 39 deletions(-) diff --git a/include/linux/futex.h b/include/linux/futex.h index 34c3a215f2cd..d097b5b72bc6 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -96,7 +96,8 @@ struct robust_list_head { long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, u32 __user *uaddr2, u32 val2, u32 val3); -extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr); +extern int +handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi); #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); diff --git a/kernel/futex.c b/kernel/futex.c index f59003b1d8f9..dda2049692a2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -495,10 +495,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) } /* - * We are the first waiter - try to look up the real owner and - * attach the new pi_state to it: + * We are the first waiter - try to look up the real owner and attach + * the new pi_state to it, but bail out when the owner died bit is set + * and TID = 0: */ pid = uval & FUTEX_TID_MASK; + if (!pid && (uval & FUTEX_OWNER_DIED)) + return -ESRCH; p = futex_find_get_task(pid); if (!p) return -ESRCH; @@ -579,16 +582,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) * kept enabled while there is PI state around. We must also * preserve the owner died bit.) */ - newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; + if (!(uval & FUTEX_OWNER_DIED)) { + newval = FUTEX_WAITERS | new_owner->pid; - inc_preempt_count(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - dec_preempt_count(); - - if (curval == -EFAULT) - return -EFAULT; - if (curval != uval) - return -EINVAL; + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + if (curval == -EFAULT) + return -EFAULT; + if (curval != uval) + return -EINVAL; + } spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); @@ -1443,9 +1447,11 @@ static int futex_unlock_pi(u32 __user *uaddr) * again. If it succeeds then we can return without waking * anyone else up: */ - inc_preempt_count(); - uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); - dec_preempt_count(); + if (!(uval & FUTEX_OWNER_DIED)) { + inc_preempt_count(); + uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); + dec_preempt_count(); + } if (unlikely(uval == -EFAULT)) goto pi_faulted; @@ -1478,9 +1484,11 @@ static int futex_unlock_pi(u32 __user *uaddr) /* * No waiters - kernel unlocks the futex: */ - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) - goto pi_faulted; + if (!(uval & FUTEX_OWNER_DIED)) { + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; + } out_unlock: spin_unlock(&hb->lock); @@ -1699,9 +1707,9 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ -int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) +int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { - u32 uval, nval; + u32 uval, nval, mval; retry: if (get_user(uval, uaddr)) @@ -1718,20 +1726,44 @@ int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) * thread-death.) The rest of the cleanup is done in * userspace. */ - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, - uval | FUTEX_OWNER_DIED); + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); + if (nval == -EFAULT) return -1; if (nval != uval) goto retry; - if (uval & FUTEX_WAITERS) - futex_wake(uaddr, 1); + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi) { + if (uval & FUTEX_WAITERS) + futex_wake(uaddr, 1); + } } return 0; } +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int fetch_robust_entry(struct robust_list __user **entry, + struct robust_list __user **head, int *pi) +{ + unsigned long uentry; + + if (get_user(uentry, (unsigned long *)head)) + return -EFAULT; + + *entry = (void *)(uentry & ~1UL); + *pi = uentry & 1; + + return 0; +} + /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. @@ -1742,14 +1774,14 @@ void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned long futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(entry, &head->list.next)) + if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: @@ -1760,10 +1792,11 @@ void exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(pending, &head->list_op_pending)) + if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; + if (pending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pip); while (entry != &head->list) { /* @@ -1772,12 +1805,12 @@ void exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(entry, &entry->next)) + if (fetch_robust_entry(&entry, &entry->next, &pi)) return; /* * Avoid excessively long or circular lists: diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d1d92b441fb7..d1aab1a452cc 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -12,6 +12,23 @@ #include + +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int +fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, + compat_uptr_t *head, int *pi) +{ + if (get_user(*uentry, head)) + return -EFAULT; + + *entry = compat_ptr((*uentry) & ~1); + *pi = (unsigned int)(*uentry) & 1; + + return 0; +} + /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. @@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi; compat_uptr_t uentry, upending; - unsigned int limit = ROBUST_LIST_LIMIT; compat_long_t futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(uentry, &head->list.next)) + if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; - entry = compat_ptr(uentry); /* * Fetch the relative futex offset: */ @@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(upending, &head->list_op_pending)) + if (fetch_robust_entry(&upending, &pending, + &head->list_op_pending, &pi)) return; - pending = compat_ptr(upending); if (upending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pi); while (compat_ptr(uentry) != &head->list) { /* @@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(uentry, (compat_uptr_t *)&entry->next)) + if (fetch_robust_entry(&uentry, &entry, + (compat_uptr_t *)&entry->next, &pi)) return; - entry = compat_ptr(uentry); /* * Avoid excessively long or circular lists: */