From mboxrd@z Thu Jan 1 00:00:00 1970 From: Ralph Campbell Subject: [PATCH 20/53] IB/qib: Add qib_init.c Date: Thu, 19 Nov 2009 15:38:41 -0800 Message-ID: <20091119233841.30356.74752.stgit@chromite.mv.qlogic.com> References: <20091119233655.30356.57433.stgit@chromite.mv.qlogic.com> Mime-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <20091119233655.30356.57433.stgit-/vjeY7uYZjrPXfVEPVhPGq6RkeBMCJyt@public.gmane.org> Sender: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org To: Roland Dreier Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org List-Id: linux-rdma@vger.kernel.org creates the qib_init.c file. Signed-off-by: Ralph Campbell --- drivers/infiniband/hw/qib/qib_init.c | 1618 ++++++++++++++++++++++++++++++++++ 1 files changed, 1618 insertions(+), 0 deletions(-) create mode 100644 drivers/infiniband/hw/qib/qib_init.c diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c new file mode 100644 index 0000000..dbf2b4b --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -0,0 +1,1618 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_common.h" + +/* + * min buffers we want to have per context, after driver + */ +#define QIB_MIN_USER_CTXT_BUFCNT 7 + +#define QLOGIC_IB_R_SOFTWARE_MASK 0xFF +#define QLOGIC_IB_R_SOFTWARE_SHIFT 24 +#define QLOGIC_IB_R_EMULATOR_MASK (1ULL<<62) + +/* + * Number of ctxts we are configured to use (to allow for more pio + * buffers per ctxt, etc.) Zero means use chip value. + */ +ushort qib_cfgctxts; +module_param_named(cfgctxts, qib_cfgctxts, ushort, S_IRUGO); +MODULE_PARM_DESC(cfgctxts, "Set max number of contexts to use"); + +/* + * If set, do not write to any regs if avoidable, hack to allow + * check for deranged default register values. + */ +ushort qib_mini_init; +module_param_named(mini_init, qib_mini_init, ushort, S_IRUGO); +MODULE_PARM_DESC(mini_init, "If set, do minimal diag init"); + +unsigned qib_n_krcv_queues; +module_param_named(krcvqs, qib_n_krcv_queues, uint, S_IRUGO); +MODULE_PARM_DESC(krcvqs, "number of kernel receive queues per IB port"); + +/* + * qib_wc_pat parameter: + * 0 is WC via MTRR + * 1 is WC via PAT + * If PAT initialization fails, code reverts back to MTRR + */ +unsigned qib_wc_pat = 1; /* default (1) is to use PAT, not MTRR */ +module_param_named(wc_pat, qib_wc_pat, uint, S_IRUGO); +MODULE_PARM_DESC(wc_pat, "enable write-combining via PAT mechanism"); + +static void verify_interrupt(unsigned long); + +static struct idr qib_unit_table; + +/* set number of contexts we'll actually use */ +void qib_set_ctxtcnt(struct qib_devdata *dd) +{ + if (!qib_cfgctxts) + dd->cfgctxts = dd->ctxtcnt; + else if (qib_cfgctxts < dd->num_pports) { + dd->cfgctxts = dd->ctxtcnt; + qib_dbg("Configured to use too few ctxts (%u); using %u\n", + qib_cfgctxts, dd->cfgctxts); + } else if (qib_cfgctxts <= dd->ctxtcnt) { + dd->cfgctxts = qib_cfgctxts; + qib_cdbg(INIT, "Configured to use %u ctxts\n", dd->cfgctxts); + } else { + dd->cfgctxts = dd->ctxtcnt; + qib_dbg("Configured to use too many ctxts (%u); using %u\n", + qib_cfgctxts, dd->cfgctxts); + } +} + +/* + * Common code for creating the receive context array. + */ +int qib_create_ctxts(struct qib_devdata *dd) +{ + unsigned i; + int ret; + + /* + * Allocate full ctxtcnt array, rather than just cfgctxts, because + * cleanup iterates across all possible ctxts. + */ + dd->rcd = kzalloc(sizeof(*dd->rcd) * dd->ctxtcnt, GFP_KERNEL); + if (!dd->rcd) { + qib_dev_err(dd, "Unable to allocate ctxtdata array, " + "failing\n"); + ret = -ENOMEM; + goto done; + } + + /* create (one or more) kctxt */ + for (i = 0; i < dd->first_user_ctxt; ++i) { + struct qib_pportdata *ppd; + struct qib_ctxtdata *rcd; + + if (dd->skip_kctxt_mask & (1 << i)) + continue; + + ppd = dd->pport + (i % dd->num_pports); + rcd = qib_create_ctxtdata(ppd, i); + if (!rcd) { + qib_dev_err(dd, "Unable to allocate ctxtdata" + " for Kernel ctxt, failing\n"); + ret = -ENOMEM; + goto done; + } + rcd->pkeys[0] = QIB_DEFAULT_P_KEY; + rcd->seq_cnt = 1; + } + ret = 0; +done: + return ret; +} + +/* + * Common code for user and kernel context setup. + */ +struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt) +{ + struct qib_devdata *dd = ppd->dd; + struct qib_ctxtdata *rcd; + + rcd = kzalloc(sizeof(*rcd), GFP_KERNEL); + if (rcd) { + INIT_LIST_HEAD(&rcd->qp_wait_list); + rcd->ppd = ppd; + rcd->dd = dd; + rcd->cnt = 1; + rcd->ctxt = ctxt; + dd->rcd[ctxt] = rcd; + + dd->f_init_ctxt(rcd); + + /* + * To avoid wasting a lot of memory, we allocate 32KB chunks + * of physically contiguous memory, advance through it until + * used up and then allocate more. Of course, we need + * memory to store those extra pointers, now. 32KB seems to + * be the most that is "safe" under memory pressure + * (creating large files and then copying them over + * NFS while doing lots of MPI jobs). The OOM killer can + * get invoked, even though we say we can sleep and this can + * cause significant system problems.... + */ + rcd->rcvegrbuf_size = 0x8000; + rcd->rcvegrbufs_perchunk = + rcd->rcvegrbuf_size / dd->rcvegrbufsize; + rcd->rcvegrbuf_chunks = (rcd->rcvegrcnt + + rcd->rcvegrbufs_perchunk - 1) / + rcd->rcvegrbufs_perchunk; + } + return rcd; +} + +/* + * Common code for initializing the physical port structure. + */ +void qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, + u8 hw_pidx, u8 port) +{ + ppd->dd = dd; + ppd->hw_pidx = hw_pidx; + ppd->port = port; /* IB port number, not index */ + + spin_lock_init(&ppd->sdma_lock); + spin_lock_init(&ppd->lflags_lock); + init_waitqueue_head(&ppd->state_wait); +} + +static int init_pioavailregs(struct qib_devdata *dd) +{ + int ret, pidx; + u64 *status_page; + + dd->pioavailregs_dma = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &dd->pioavailregs_phys, + GFP_KERNEL); + if (!dd->pioavailregs_dma) { + qib_dev_err(dd, "failed to allocate PIOavail reg area " + "in memory\n"); + ret = -ENOMEM; + goto done; + } + + /* + * We really want L2 cache aligned, but for current CPUs of + * interest, they are the same. + */ + status_page = (u64 *) + ((char *) dd->pioavailregs_dma + + ((2 * L1_CACHE_BYTES + + dd->pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES)); + /* device status comes first, for backwards compatibility */ + dd->devstatusp = status_page; + *status_page++ = 0; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + dd->pport[pidx].statusp = status_page; + *status_page++ = 0; + } + + /* + * Setup buffer to hold freeze and other messages, accessible to + * apps, following statusp. This is per-unit, not per port. + */ + dd->freezemsg = (char *) status_page; + *dd->freezemsg = 0; + /* length of msg buffer is "whatever is left" */ + ret = (char *) status_page - (char *) dd->pioavailregs_dma; + dd->freezelen = PAGE_SIZE - ret; + + ret = 0; + +done: + return ret; +} + +/** + * init_shadow_tids - allocate the shadow TID array + * @dd: the qlogic_ib device + * + * allocate the shadow TID array, so we can qib_munlock previous + * entries. It may make more sense to move the pageshadow to the + * ctxt data structure, so we only allocate memory for ctxts actually + * in use, since we at 8k per ctxt, now. + * We don't want failures here to prevent use of the driver/chip, + * so no return value. + */ +static void init_shadow_tids(struct qib_devdata *dd) +{ + struct page **pages; + dma_addr_t *addrs; + + pages = vmalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(struct page *)); + if (!pages) { + qib_dev_err(dd, "failed to allocate shadow page * " + "array, no expected sends!\n"); + goto bail; + } + + addrs = vmalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(dma_addr_t)); + if (!addrs) { + qib_dev_err(dd, "failed to allocate shadow dma handle " + "array, no expected sends!\n"); + goto bail_free; + } + + memset(pages, 0, dd->cfgctxts * dd->rcvtidcnt * sizeof(struct page *)); + memset(addrs, 0, dd->cfgctxts * dd->rcvtidcnt * sizeof(dma_addr_t)); + + dd->pageshadow = pages; + dd->physshadow = addrs; + return; + +bail_free: + vfree(pages); +bail: + dd->pageshadow = NULL; +} + +/* + * Do initialization for device that is only needed on + * first detect, not on resets. + */ +static int loadtime_init(struct qib_devdata *dd) +{ + int ret = 0; + + qib_cdbg(VERBOSE, "Revision %Lx\n", dd->revision); + + if (((dd->revision >> QLOGIC_IB_R_SOFTWARE_SHIFT) & + QLOGIC_IB_R_SOFTWARE_MASK) != QIB_CHIP_SWVERSION) { + qib_dev_err(dd, "Driver only handles version %d, " + "chip swversion is %d (%llx), failng\n", + QIB_CHIP_SWVERSION, + (int)(dd->revision >> + QLOGIC_IB_R_SOFTWARE_SHIFT) & + QLOGIC_IB_R_SOFTWARE_MASK, + (unsigned long long) dd->revision); + ret = -ENOSYS; + goto done; + } + + if (dd->revision & QLOGIC_IB_R_EMULATOR_MASK) + qib_devinfo(dd->pcidev, "%s", dd->boardversion); + else + qib_cdbg(INIT, "%s", dd->boardversion); + + spin_lock_init(&dd->pioavail_lock); + spin_lock_init(&dd->sendctrl_lock); + spin_lock_init(&dd->uctxt_lock); + spin_lock_init(&dd->qib_diag_trans_lock); + spin_lock_init(&dd->eep_st_lock); + mutex_init(&dd->eep_lock); + + if (qib_mini_init) + goto done; + + ret = init_pioavailregs(dd); + init_shadow_tids(dd); + + qib_get_eeprom_info(dd); + + /* setup time (don't start yet) to verify we got interrupt */ + init_timer(&dd->intrchk_timer); + dd->intrchk_timer.function = verify_interrupt; + dd->intrchk_timer.data = (unsigned long) dd; + +done: + return ret; +} + +/** + * init_after_reset - re-initialize after a reset + * @dd: the qlogic_ib device + * + * sanity check at least some of the values after reset, and + * ensure no receive or transmit (explictly, in case reset + * failed + */ +static int init_after_reset(struct qib_devdata *dd) +{ + int i; + + /* + * Ensure chip does no sends or receives, tail updates, or + * pioavail updates while we re-initialize. This is mostly + * for the driver data structures, not chip registers. + */ + for (i = 0; i < dd->num_pports; ++i) { + /* + * ctxt == -1 means "all contexts". Only really safe for + * _dis_abling things, as here. + */ + dd->f_rcvctrl(dd->pport + i, QIB_RCVCTRL_CTXT_DIS | + QIB_RCVCTRL_INTRAVAIL_DIS | + QIB_RCVCTRL_TAILUPD_DIS, -1); + /* Redundant across ports for some, but no big deal. */ + dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_DIS | + QIB_SENDCTRL_AVAIL_DIS); + } + + return 0; +} + +static void enable_chip(struct qib_devdata *dd) +{ + u64 rcvmask; + int i; + + /* + * Enable PIO send, and update of PIOavail regs to memory. + */ + for (i = 0; i < dd->num_pports; ++i) + dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_ENB | + QIB_SENDCTRL_AVAIL_ENB); + /* + * Enable kernel ctxts' receive and receive interrupt. + * Other ctxts done as user opens and inits them. + */ + rcvmask = QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_INTRAVAIL_ENB; + rcvmask |= (dd->flags & QIB_NODMA_RTAIL) ? + QIB_RCVCTRL_TAILUPD_DIS : QIB_RCVCTRL_TAILUPD_ENB; + for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) { + struct qib_ctxtdata *rcd = dd->rcd[i]; + + if (rcd) + dd->f_rcvctrl(rcd->ppd, rcvmask, i); + } +} + +static void verify_interrupt(unsigned long opaque) +{ + struct qib_devdata *dd = (struct qib_devdata *) opaque; + + if (!dd) + return; /* being torn down */ + + /* + * If we don't have a lid or any interrupts, let the user know and + * don't bother checking again. + */ + if (dd->int_counter == 0) { + if (!dd->f_intr_fallback(dd)) + dev_err(&dd->pcidev->dev, "No interrupts detected, " + "not usable.\n"); + else /* re-arm the timer to see if fallback works */ + mod_timer(&dd->intrchk_timer, jiffies + HZ/2); + } else + qib_cdbg(INIT, "%u interrupts at timer check\n", + dd->int_counter); +} + +static void init_piobuf_state(struct qib_devdata *dd) +{ + int i, pidx; + u32 uctxts; + + /* + * Ensure all buffers are free, and fifos empty. Buffers + * are common, so only do once for port 0. + * + * After enable and qib_chg_pioavailkernel so we can safely + * enable pioavail updates and PIOENABLE. After this, packets + * are ready and able to go out. + */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_ALL); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + dd->f_sendctrl(dd->pport + pidx, QIB_SENDCTRL_FLUSH); + + /* + * If not all sendbufs are used, add the one to each of the lower + * numbered contexts. pbufsctxt and lastctxt_piobuf are + * calculated in chip-specific code because it may cause some + * chip-specific adjustments to be made. + */ + uctxts = dd->cfgctxts - dd->first_user_ctxt; + qib_cdbg(INIT, "%d sendbufs for each of %u user ctxts\n", + dd->pbufsctxt, uctxts); + dd->ctxts_extrabuf = dd->pbufsctxt ? + dd->lastctxt_piobuf - (dd->pbufsctxt * uctxts) : 0; + if (dd->ctxts_extrabuf) + qib_cdbg(INIT, "%u pbufs/ctxt leaves some unused, add a " + "buffer to ctxts %u - %u\n", dd->pbufsctxt, + dd->first_user_ctxt, + dd->first_user_ctxt + dd->ctxts_extrabuf - 1); + + /* + * Set up the shadow copies of the piobufavail registers, + * which we compare against the chip registers for now, and + * the in memory DMA'ed copies of the registers. + * By now pioavail updates to memory should have occurred, so + * copy them into our working/shadow registers; this is in + * case something went wrong with abort, but mostly to get the + * initial values of the generation bit correct. + */ + for (i = 0; i < dd->pioavregs; i++) { + __le64 tmp; + + tmp = dd->pioavailregs_dma[i]; + /* + * Don't need to worry about pioavailkernel here + * because we will call qib_chg_pioavailkernel() later + * in initialization, to busy out buffers as needed. + */ + dd->pioavailshadow[i] = le64_to_cpu(tmp); + } + while (i < ARRAY_SIZE(dd->pioavailshadow)) + dd->pioavailshadow[i++] = 0; /* for debugging sanity */ + + /* after pioavailshadow is setup */ + qib_chg_pioavailkernel(dd, 0, dd->piobcnt2k + dd->piobcnt4k, + TXCHK_CHG_TYPE_KERN, NULL); + dd->f_initvl15_bufs(dd); +} + +/** + * qib_init - do the actual initialization sequence on the chip + * @dd: the qlogic_ib device + * @reinit: reinitializing, so don't allocate new memory + * + * Do the actual initialization sequence on the chip. This is done + * both from the init routine called from the PCI infrastructure, and + * when we reset the chip, or detect that it was reset internally, + * or it's administratively re-enabled. + * + * Memory allocation here and in called routines is only done in + * the first case (reinit == 0). We have to be careful, because even + * without memory allocation, we need to re-write all the chip registers + * TIDs, etc. after the reset or enable has completed. + */ +int qib_init(struct qib_devdata *dd, int reinit) +{ + int ret = 0, pidx, lastfail = 0; + u32 portok = 0; + unsigned i; + struct qib_ctxtdata *rcd; + struct qib_pportdata *ppd; + unsigned long flags; + + /* Set linkstate to unknown, so we can watch for a transition. */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~(QIBL_LINKACTIVE | QIBL_LINKARMED | + QIBL_LINKDOWN | QIBL_LINKINIT | + QIBL_LINKV); + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + + if (reinit) + ret = init_after_reset(dd); + else + ret = loadtime_init(dd); + if (ret) + goto done; + + /* Bypass most chip-init, to get to device creation */ + if (qib_mini_init) + return 0; + + ret = dd->f_late_initreg(dd); + if (ret) + goto done; + + /* dd->rcd can be NULL if early init failed */ + for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) { + /* + * Set up the (kernel) rcvhdr queue and egr TIDs. If doing + * re-init, the simplest way to handle this is to free + * existing, and re-allocate. + * Need to re-create rest of ctxt 0 ctxtdata as well. + */ + rcd = dd->rcd[i]; + if (!rcd) + continue; + + lastfail = qib_create_rcvhdrq(dd, rcd); + if (!lastfail) + lastfail = qib_setup_eagerbufs(rcd); + if (lastfail) { + qib_dev_err(dd, "failed to allocate kernel ctxt's " + "rcvhdrq and/or egr bufs\n"); + continue; + } + } + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + int mtu; + if (lastfail) + ret = lastfail; + ppd = dd->pport + pidx; + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) { + mtu = QIB_DEFAULT_MTU; + qib_ibmtu = 0; /* don't leave invalid value */ + } + /* set max we can ever have for this driver load */ + ppd->init_ibmaxlen = min(mtu > 2048 ? + dd->piosize4k : dd->piosize2k, + dd->rcvegrbufsize + + (dd->rcvhdrentsize << 2)); + /* + * Have to initialize ibmaxlen, but this will normally + * change immediately in qib_set_mtu(). + */ + ppd->ibmaxlen = ppd->init_ibmaxlen; + qib_set_mtu(ppd, mtu); + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + lastfail = dd->f_bringup_serdes(ppd); + if (lastfail) { + qib_devinfo(dd->pcidev, + "Failed to bringup IB port %u\n", ppd->port); + lastfail = -ENETDOWN; + continue; + } + + /* let link come up, and enable IBC */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + portok++; + } + + if (!portok) { + /* none of the ports initialized */ + if (!ret && lastfail) + ret = lastfail; + else if (!ret) + ret = -ENETDOWN; + /* but continue on, so we can debug cause */ + } + + enable_chip(dd); + + init_piobuf_state(dd); + +done: + if (!ret) { + /* chip is OK for user apps; mark it as initialized */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + /* + * Set status even if port serdes is not initialized + * so that diags will work. + */ + *ppd->statusp |= QIB_STATUS_CHIP_PRESENT | + QIB_STATUS_INITTED; + if (!ppd->link_speed_enabled) + continue; + if (dd->flags & QIB_HAS_SEND_DMA) + ret = setup_sdma(ppd); + init_timer(&ppd->hol_timer); + ppd->hol_timer.function = qib_hol_event; + ppd->hol_timer.data = (unsigned long)ppd; + ppd->hol_state = QIB_HOL_UP; + } + + /* now we can enable all interrupts from the chip */ + dd->f_set_intr_state(dd, 1); + + /* + * Setup to verify we get an interrupt, and fallback + * to an alternate if necessary and possible. + */ + mod_timer(&dd->intrchk_timer, jiffies + HZ/2); + /* start stats retrieval timer */ + mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); + } else + qib_dbg("Failed (%d) to initialize unit %u\n", + ret, dd->unit); + + /* if ret is non-zero, we probably should do some cleanup here... */ + return ret; +} + +/* + * These next two routines are placeholders in case we don't have per-arch + * code for controlling write combining. If explicit control of write + * combining is not available, performance will probably be awful. + */ + +int __attribute__((weak)) qib_enable_wc(struct qib_devdata *dd) +{ + return -EOPNOTSUPP; +} + +void __attribute__((weak)) qib_disable_wc(struct qib_devdata *dd) +{ +} + +static inline struct qib_devdata *__qib_lookup(int unit) +{ + return idr_find(&qib_unit_table, unit); +} + +struct qib_devdata *qib_lookup(int unit) +{ + struct qib_devdata *dd; + unsigned long flags; + + spin_lock_irqsave(&qib_devs_lock, flags); + dd = __qib_lookup(unit); + spin_unlock_irqrestore(&qib_devs_lock, flags); + + return dd; +} + +/* + * Stop the timers during unit shutdown, or after an error late + * in initialization. + */ +static void qib_stop_timers(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + int pidx; + + if (dd->stats_timer.data) { + del_timer_sync(&dd->stats_timer); + dd->stats_timer.data = 0; + } + if (dd->intrchk_timer.data) { + del_timer_sync(&dd->intrchk_timer); + dd->intrchk_timer.data = 0; + } + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->hol_timer.data) + del_timer_sync(&ppd->hol_timer); + if (ppd->led_override_timer.data) { + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + } + } +} + +/** + * qib_shutdown_device - shut down a device + * @dd: the qlogic_ib device + * + * This is called to make the device quiet when we are about to + * unload the driver, and also when the device is administratively + * disabled. It does not free any data structures. + * Everything it does has to be setup again by qib_init(dd, 1) + */ +static void qib_shutdown_device(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + unsigned pidx; + + qib_dbg("Shutting down the device\n"); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + spin_lock_irq(&ppd->lflags_lock); + ppd->lflags &= ~(QIBL_LINKDOWN | QIBL_LINKINIT | + QIBL_LINKARMED | QIBL_LINKACTIVE | + QIBL_LINKV); + spin_unlock_irq(&ppd->lflags_lock); + *ppd->statusp &= ~(QIB_STATUS_IB_CONF | QIB_STATUS_IB_READY); + } + dd->flags &= ~QIB_INITTED; + + /* mask interrupts, but not errors */ + dd->f_set_intr_state(dd, 0); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + dd->f_rcvctrl(ppd, QIB_RCVCTRL_TAILUPD_DIS | + QIB_RCVCTRL_CTXT_DIS | + QIB_RCVCTRL_INTRAVAIL_DIS | + QIB_RCVCTRL_PKEY_ENB, -1); + /* + * Gracefully stop all sends allowing any in progress to + * trickle out first. + */ + dd->f_sendctrl(ppd, QIB_SENDCTRL_CLEAR); + } + + /* + * Enough for anything that's going to trickle out to have actually + * done so. + */ + udelay(20); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + dd->f_setextled(ppd, 0); /* make sure LEDs are off */ + + if (dd->flags & QIB_HAS_SEND_DMA) + teardown_sdma(ppd); + + dd->f_sendctrl(ppd, QIB_SENDCTRL_AVAIL_DIS | + QIB_SENDCTRL_SEND_DIS); + /* + * Clear SerdesEnable. + * We can't count on interrupts since we are stopping. + */ + dd->f_quiet_serdes(ppd); + } + + qib_cdbg(VERBOSE, "Flush time and errors to EEPROM\n"); + qib_update_eeprom_log(dd); +} + +/** + * qib_free_ctxtdata - free a context's allocated data + * @dd: the qlogic_ib device + * @rcd: the ctxtdata structure + * + * free up any allocated data for a context + * This should not touch anything that would affect a simultaneous + * re-allocation of context data, because it is called after qib_mutex + * is released (and can be called from reinit as well). + * It should never change any chip state, or global driver state. + */ +void qib_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd) +{ + if (!rcd) + return; + + if (rcd->rcvhdrq) { + qib_cdbg(VERBOSE, "free closed ctxt %d rcvhdrq @ %p " + "(size=%lu)\n", rcd->ctxt, rcd->rcvhdrq, + (unsigned long) rcd->rcvhdrq_size); + dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size, + rcd->rcvhdrq, rcd->rcvhdrq_phys); + rcd->rcvhdrq = NULL; + if (rcd->rcvhdrtail_kvaddr) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + rcd->rcvhdrtail_kvaddr, + rcd->rcvhdrqtailaddr_phys); + rcd->rcvhdrtail_kvaddr = NULL; + } + } + if (rcd->rcvegrbuf) { + unsigned e; + + for (e = 0; e < rcd->rcvegrbuf_chunks; e++) { + void *base = rcd->rcvegrbuf[e]; + size_t size = rcd->rcvegrbuf_size; + + qib_cdbg(VERBOSE, "egrbuf free(%p, %lu), " + "chunk %u/%u\n", base, + (unsigned long) size, + e, rcd->rcvegrbuf_chunks); + dma_free_coherent(&dd->pcidev->dev, size, + base, rcd->rcvegrbuf_phys[e]); + } + kfree(rcd->rcvegrbuf); + rcd->rcvegrbuf = NULL; + kfree(rcd->rcvegrbuf_phys); + rcd->rcvegrbuf_phys = NULL; + rcd->rcvegrbuf_chunks = 0; + } + + kfree(rcd->tid_pg_list); + vfree(rcd->user_event_mask); + vfree(rcd->subctxt_uregbase); + vfree(rcd->subctxt_rcvegrbuf); + vfree(rcd->subctxt_rcvhdr_base); + kfree(rcd); +} + +/* + * Perform a PIO buffer bandwidth write test, to verify proper system + * configuration. Even when all the setup calls work, occasionally + * BIOS or other issues can prevent write combining from working, or + * can cause other bandwidth problems to the chip. + * + * This test simply writes the same buffer over and over again, and + * measures close to the peak bandwidth to the chip (not testing + * data bandwidth to the wire). On chips that use an address-based + * trigger to send packets to the wire, this is easy. On chips that + * use a count to trigger, we want to make sure that the packet doesn't + * go out on the wire, or trigger flow control checks. + */ +static void qib_verify_pioperf(struct qib_devdata *dd) +{ + u32 pbnum, cnt, lcnt; + u32 __iomem *piobuf; + u32 *addr; + u64 msecs, emsecs; + + piobuf = dd->f_getsendbuf(dd->pport, 0ULL, &pbnum); + if (!piobuf) { + qib_devinfo(dd->pcidev, + "No PIObufs for checking perf, skipping\n"); + return; + } + + /* + * Enough to give us a reasonable test, less than piobuf size, and + * likely multiple of store buffer length. + */ + cnt = 1024; + + addr = vmalloc(cnt); + if (!addr) { + qib_devinfo(dd->pcidev, + "Couldn't get memory for checking PIO perf," + " skipping\n"); + goto done; + } + + preempt_disable(); /* we want reasonably accurate elapsed time */ + msecs = 1 + jiffies_to_msecs(jiffies); + for (lcnt = 0; lcnt < 10000U; lcnt++) { + /* wait until we cross msec boundary */ + if (jiffies_to_msecs(jiffies) >= msecs) + break; + udelay(1); + } + + dd->f_set_armlaunch(dd, 0); + + /* + * length 0, no dwords actually sent + */ + writeq(0, piobuf); + qib_flush_wc(); + + /* + * This is only roughly accurate, since even with preempt we + * still take interrupts that could take a while. Running for + * >= 5 msec seems to get us "close enough" to accurate values. + */ + msecs = jiffies_to_msecs(jiffies); + for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) { + qib_pio_copy(piobuf + 64, addr, cnt >> 2); + emsecs = jiffies_to_msecs(jiffies) - msecs; + } + + /* 1 GiB/sec, slightly over IB SDR line rate */ + if (lcnt < (emsecs * 1024U)) + qib_dev_err(dd, + "Performance problem: bandwidth to PIO buffers is " + "only %u MiB/sec\n", + lcnt / (u32) emsecs); + else + qib_cdbg(INIT, "PIO buffer bandwidth %u MiB/sec is OK\n", + lcnt / (u32) emsecs); + + preempt_enable(); + + vfree(addr); + +done: + /* disarm piobuf, so it's available again */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(pbnum)); + qib_sendbuf_done(dd, pbnum); + dd->f_set_armlaunch(dd, 1); +} + + +void qib_free_devdata(struct qib_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&qib_devs_lock, flags); + idr_remove(&qib_unit_table, dd->unit); + list_del(&dd->list); + spin_unlock_irqrestore(&qib_devs_lock, flags); + + ib_dealloc_device(&dd->verbs_dev.ibdev); +} + +/* + * Allocate our primary per-unit data structure. Must be done via verbs + * allocator, because the verbs cleanup process both does cleanup and + * free of the data structure. + * "extra" is for chip-specific data. + * + * Use the idr mechanism to get a unit number for this unit. + */ +struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) +{ + unsigned long flags; + struct qib_devdata *dd; + int ret; + + if (!idr_pre_get(&qib_unit_table, GFP_KERNEL)) { + dd = ERR_PTR(-ENOMEM); + goto bail; + } + + dd = (struct qib_devdata *) ib_alloc_device(sizeof(*dd) + extra); + if (!dd) { + dd = ERR_PTR(-ENOMEM); + goto bail; + } + + spin_lock_irqsave(&qib_devs_lock, flags); + ret = idr_get_new(&qib_unit_table, dd, &dd->unit); + if (ret >= 0) + list_add(&dd->list, &qib_dev_list); + spin_unlock_irqrestore(&qib_devs_lock, flags); + + if (ret < 0) { + qib_early_err(&pdev->dev, + "Could not allocate unit ID: error %d\n", -ret); + ib_dealloc_device(&dd->verbs_dev.ibdev); + dd = ERR_PTR(ret); + } +bail: + return dd; +} + +/* + * Called from freeze mode handlers, and from PCI error + * reporting code. Should be paranoid about state of + * system and data structures. + */ +void qib_disable_after_error(struct qib_devdata *dd) +{ + if (dd->flags & QIB_INITTED) { + u32 pidx; + + dd->flags &= ~QIB_INITTED; + if (dd->pport) + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + struct qib_pportdata *ppd; + + ppd = dd->pport + pidx; + if (dd->flags & QIB_PRESENT) { + qib_set_linkstate(ppd, + QIB_IB_LINKDOWN_DISABLE); + dd->f_setextled(ppd, 0); + } + *ppd->statusp &= ~QIB_STATUS_IB_READY; + } + } + + /* + * Mark as having had an error for driver, and also + * for /sys and status word mapped to user programs. + * This marks unit as not usable, until reset. + */ + if (dd->devstatusp) + *dd->devstatusp |= QIB_STATUS_HWERROR; +} + +static void __devexit qib_remove_one(struct pci_dev *); +static int __devinit qib_init_one(struct pci_dev *, + const struct pci_device_id *); + +#define DRIVER_LOAD_MSG "QLogic " QIB_DRV_NAME " loaded: " +#define PFX QIB_DRV_NAME ": " + +static const struct pci_device_id qib_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_QLOGIC_IB_6120) }, + { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7220) }, + { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7322) }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, qib_pci_tbl); + +struct pci_driver qib_driver = { + .name = QIB_DRV_NAME, + .probe = qib_init_one, + .remove = __devexit_p(qib_remove_one), + .id_table = qib_pci_tbl, + .err_handler = &qib_pci_err_handler, +}; + +/* + * Do all the generic driver unit- and chip-independent memory + * allocation and initialization. + */ +static int __init qlogic_ib_init(void) +{ + int ret; + + if (qib_debug & __QIB_DBG) + printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_qib_version); + + ret = qib_dev_init(); + if (ret) + goto bail; + + ret = qib_trace_init(); + if (ret) + goto bail_dev; + + /* + * These must be called before the driver is registered with + * the PCI subsystem. + */ + idr_init(&qib_unit_table); + if (!idr_pre_get(&qib_unit_table, GFP_KERNEL)) { + printk(KERN_ERR QIB_DRV_NAME ": idr_pre_get() failed\n"); + ret = -ENOMEM; + goto bail_trace; + } + + ret = pci_register_driver(&qib_driver); + if (ret < 0) { + printk(KERN_ERR QIB_DRV_NAME + ": Unable to register driver: error %d\n", -ret); + goto bail_unit; + } + + /* not fatal if it doesn't work */ + if (qib_init_qibfs()) + printk(KERN_ERR QIB_DRV_NAME ": Unable to register ipathfs\n"); + goto bail; /* all OK */ + +bail_unit: + idr_destroy(&qib_unit_table); +bail_trace: + qib_trace_fini(); +bail_dev: + qib_dev_cleanup(); +bail: + return ret; +} + +module_init(qlogic_ib_init); + +/* + * Do the non-unit driver cleanup, memory free, etc. at unload. + */ +static void __exit qlogic_ib_cleanup(void) +{ + int ret; + + ret = qib_exit_qibfs(); + if (ret) + printk(KERN_ERR QIB_DRV_NAME ": " + "Unable to cleanup counter filesystem: " + "error %d\n", -ret); + + pci_unregister_driver(&qib_driver); + + idr_destroy(&qib_unit_table); + qib_trace_fini(); + qib_dev_cleanup(); +} + +module_exit(qlogic_ib_cleanup); + +/* this can only be called after a successful initialization */ +static void cleanup_device_data(struct qib_devdata *dd) +{ + int ctxt; + int pidx; + struct qib_ctxtdata **tmp; + unsigned long flags; + + /* users can't do anything more with chip */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) + if (dd->pport[pidx].statusp) + *dd->pport[pidx].statusp &= ~QIB_STATUS_CHIP_PRESENT; + + if (!qib_wc_pat) + qib_disable_wc(dd); + + if (dd->pioavailregs_dma) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *) dd->pioavailregs_dma, + dd->pioavailregs_phys); + dd->pioavailregs_dma = NULL; + } + + if (dd->pageshadow) { + struct page **tmpp = dd->pageshadow; + dma_addr_t *tmpd = dd->physshadow; + int i, cnt = 0; + + qib_cdbg(VERBOSE, "Unlocking any expTID pages still " + "locked\n"); + for (ctxt = 0; ctxt < dd->cfgctxts; ctxt++) { + int ctxt_tidbase = ctxt * dd->rcvtidcnt; + int maxtid = ctxt_tidbase + dd->rcvtidcnt; + + for (i = ctxt_tidbase; i < maxtid; i++) { + if (!tmpp[i]) + continue; + pci_unmap_page(dd->pcidev, tmpd[i], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + qib_release_user_pages(&tmpp[i], 1); + tmpp[i] = NULL; + cnt++; + } + } + if (cnt) + qib_cdbg(VERBOSE, "There were still %u expTID " + "entries locked\n", cnt); + + qib_cdbg(VERBOSE, "Free shadow page tid array at %p\n", + dd->pageshadow); + tmpp = dd->pageshadow; + dd->pageshadow = NULL; + vfree(tmpp); + } + + /* + * Free any resources still in use (usually just kernel contexts) + * at unload; we do for ctxtcnt, because that's what we allocate. + * We acquire lock to be really paranoid that rcd isn't being + * accessed from some interrupt-related code (that should not happen, + * but best to be sure). + */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + tmp = dd->rcd; + dd->rcd = NULL; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + for (ctxt = 0; tmp && ctxt < dd->ctxtcnt; ctxt++) { + struct qib_ctxtdata *rcd = tmp[ctxt]; + + tmp[ctxt] = NULL; /* debugging paranoia */ + qib_free_ctxtdata(dd, rcd); + } + kfree(tmp); + kfree(dd->boardname); +} + +/* + * Clean up on unit shutdown, or error during unit load after + * successful initialization. + */ +static void qib_postinit_cleanup(struct qib_devdata *dd) +{ + /* + * Clean up chip-specific stuff. + * We check for NULL here, because it's outside + * the kregbase check, and we need to call it + * after the free_irq. Thus it's possible that + * the function pointers were never initialized. + */ + if (dd->f_cleanup) + dd->f_cleanup(dd); + + qib_pcie_ddcleanup(dd); + + cleanup_device_data(dd); + + qib_free_devdata(dd); +} + +static int __devinit qib_init_one(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + int ret, j, pidx, initfail; + struct qib_devdata *dd = NULL; + + ret = qib_pcie_init(pdev, ent); + if (ret) + goto bail; + + /* + * Do device-specific initialiation, function table setup, dd + * allocation, etc. + */ + switch (ent->device) { + case PCI_DEVICE_ID_QLOGIC_IB_6120: + dd = qib_init_iba6120_funcs(pdev, ent); + break; + + case PCI_DEVICE_ID_QLOGIC_IB_7220: + dd = qib_init_iba7220_funcs(pdev, ent); + break; + + case PCI_DEVICE_ID_QLOGIC_IB_7322: + dd = qib_init_iba7322_funcs(pdev, ent); + break; + + default: + qib_early_err(&pdev->dev, "Failing on unknown QLogic " + "deviceid 0x%x\n", ent->device); + ret = -ENODEV; + } + + if (IS_ERR(dd)) + ret = PTR_ERR(dd); + if (ret) + goto bail; /* error already printed */ + + /* do the generic initialization */ + initfail = qib_init(dd, 0); + + ret = qib_register_ib_device(dd); + + /* + * Now ready for use. this should be cleared whenever we + * detect a reset, or initiate one. If earlier failure, + * we still create devices, so diags, etc. can be used + * to determine cause of problem. + */ + if (!qib_mini_init && !initfail && !ret) + dd->flags |= QIB_INITTED; + + j = qib_device_create(dd); + if (j) + qib_dev_err(dd, "Failed to create /dev devices: %d\n", -j); + j = qibfs_add(dd); + if (j) + qib_dev_err(dd, "Failed filesystem setup for counters: %d\n", + -j); + + if (qib_mini_init || initfail || ret) { + qib_stop_timers(dd); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + dd->f_quiet_serdes(dd->pport + pidx); + if (initfail) + ret = initfail; + goto bail; + } + + if (!qib_wc_pat) { + ret = qib_enable_wc(dd); + if (ret) { + qib_dev_err(dd, "Write combining not enabled " + "(err %d): performance may be poor\n", + -ret); + ret = 0; + } + } + + qib_verify_pioperf(dd); +bail: + return ret; +} + +static void __devexit qib_remove_one(struct pci_dev *pdev) +{ + struct qib_devdata *dd = pci_get_drvdata(pdev); + int ret; + + qib_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd); + + /* unregister from IB core */ + qib_unregister_ib_device(dd); + + /* + * Disable the IB link, disable interrupts on the device, + * clear dma engines, etc. + */ + if (!qib_mini_init) + qib_shutdown_device(dd); + + qib_stop_timers(dd); + + /* wait until all of our (qsfp) schedule_work() calls complete */ + flush_scheduled_work(); + + ret = qibfs_remove(dd); + if (ret) + qib_dev_err(dd, "Failed counters filesystem cleanup: %d\n", + -ret); + + qib_device_remove(dd); + + qib_postinit_cleanup(dd); +} + +/** + * qib_create_rcvhdrq - create a receive header queue + * @dd: the qlogic_ib device + * @rcd: the context data + * + * This must be contiguous memory (from an i/o perspective), and must be + * DMA'able (which means for some systems, it will go through an IOMMU, + * or be forced into a low address range). + */ +int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd) +{ + unsigned amt; + + if (!rcd->rcvhdrq) { + dma_addr_t phys_hdrqtail; + gfp_t gfp_flags; + + amt = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize * + sizeof(u32), PAGE_SIZE); + gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ? + GFP_USER : GFP_KERNEL; + rcd->rcvhdrq = dma_alloc_coherent( + &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys, + gfp_flags | __GFP_COMP); + + if (!rcd->rcvhdrq) { + qib_dev_err(dd, "attempt to allocate %d bytes " + "for ctxt %u rcvhdrq failed\n", + amt, rcd->ctxt); + goto bail; + } + + if (rcd->ctxt >= dd->first_user_ctxt) { + rcd->user_event_mask = vmalloc_user(PAGE_SIZE); + if (!rcd->user_event_mask) + goto bail_free_hdrq; + } + + if (!(dd->flags & QIB_NODMA_RTAIL)) { + rcd->rcvhdrtail_kvaddr = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, + gfp_flags); + if (!rcd->rcvhdrtail_kvaddr) + goto bail_free; + rcd->rcvhdrqtailaddr_phys = phys_hdrqtail; + qib_cdbg(INIT, "ctxt %d hdrtailaddr, %llx " + "physical\n", rcd->ctxt, + (unsigned long long) phys_hdrqtail); + } + + rcd->rcvhdrq_size = amt; + + qib_cdbg(PROC, "%d pages at %p (phys %lx) size=%lu " + "for ctxt %u rcvhdr Q\n", + amt >> PAGE_SHIFT, rcd->rcvhdrq, + (unsigned long) rcd->rcvhdrq_phys, + (unsigned long) rcd->rcvhdrq_size, + rcd->ctxt); + } else + qib_cdbg(PROC, "reuse ctxt %d rcvhdrq @%p %llx phys; " + "hdrtailaddr@%p %llx physical\n", + rcd->ctxt, rcd->rcvhdrq, + (unsigned long long) rcd->rcvhdrq_phys, + rcd->rcvhdrtail_kvaddr, (unsigned long long) + rcd->rcvhdrqtailaddr_phys); + + /* clear for security and sanity on each use */ + memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size); + if (rcd->rcvhdrtail_kvaddr) + memset(rcd->rcvhdrtail_kvaddr, 0, PAGE_SIZE); + return 0; + +bail_free: + qib_dev_err(dd, "attempt to allocate 1 page for ctxt %u " + "rcvhdrqtailaddr failed\n", rcd->ctxt); + vfree(rcd->user_event_mask); + rcd->user_event_mask = NULL; +bail_free_hdrq: + dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq, + rcd->rcvhdrq_phys); + rcd->rcvhdrq = NULL; +bail: + return -ENOMEM; +} + +/** + * allocate eager buffers, both kernel and user contexts. + * @rcd: the context we are setting up. + * + * Allocate the eager TID buffers and program them into hip. + * They are no longer completely contiguous, we do multiple allocation + * calls. Otherwise we get the OOM code involved, by asking for too + * much per call, with disastrous results on some kernels. + */ +int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) +{ + struct qib_devdata *dd = rcd->dd; + unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff; + size_t size; + gfp_t gfp_flags; + + /* + * GFP_USER, but without GFP_FS, so buffer cache can be + * coalesced (we hope); otherwise, even at order 4, + * heavy filesystem activity makes these fail, and we can + * use compound pages. + */ + gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + + egrcnt = rcd->rcvegrcnt; + egroff = rcd->rcvegr_tid_base; + egrsize = dd->rcvegrbufsize; + qib_cdbg(PROC, "ctxt%d: %d egr buffers, at egrtid " + "offset %x, egrsize %u\n", rcd->ctxt, egrcnt, egroff, + egrsize); + + chunk = rcd->rcvegrbuf_chunks; + egrperchunk = rcd->rcvegrbufs_perchunk; + size = rcd->rcvegrbuf_size; + if (!rcd->rcvegrbuf) { + rcd->rcvegrbuf = + kzalloc(chunk * sizeof(rcd->rcvegrbuf[0]), + GFP_KERNEL); + if (!rcd->rcvegrbuf) + goto bail; + } + if (!rcd->rcvegrbuf_phys) { + rcd->rcvegrbuf_phys = + kmalloc(chunk * sizeof(rcd->rcvegrbuf_phys[0]), + GFP_KERNEL); + if (!rcd->rcvegrbuf_phys) + goto bail_rcvegrbuf; + } + for (e = 0; e < rcd->rcvegrbuf_chunks; e++) { + if (rcd->rcvegrbuf[e]) + continue; + rcd->rcvegrbuf[e] = + dma_alloc_coherent(&dd->pcidev->dev, size, + &rcd->rcvegrbuf_phys[e], + gfp_flags); + if (!rcd->rcvegrbuf[e]) + goto bail_rcvegrbuf_phys; + } + + rcd->rcvegr_phys = rcd->rcvegrbuf_phys[0]; + + for (e = chunk = 0; chunk < rcd->rcvegrbuf_chunks; chunk++) { + dma_addr_t pa = rcd->rcvegrbuf_phys[chunk]; + unsigned i; + + for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) { + dd->f_put_tid(dd, e + egroff + + (u64 __iomem *) + ((char __iomem *) + dd->kregbase + + dd->rcvegrbase), + RCVHQ_RCV_TYPE_EAGER, pa); + pa += egrsize; + } + cond_resched(); /* don't hog the cpu */ + } + + return 0; + +bail_rcvegrbuf_phys: + for (e = 0; e < rcd->rcvegrbuf_chunks && rcd->rcvegrbuf[e]; e++) + dma_free_coherent(&dd->pcidev->dev, size, + rcd->rcvegrbuf[e], rcd->rcvegrbuf_phys[e]); + kfree(rcd->rcvegrbuf_phys); + rcd->rcvegrbuf_phys = NULL; +bail_rcvegrbuf: + kfree(rcd->rcvegrbuf); + rcd->rcvegrbuf = NULL; +bail: + return -ENOMEM; +} + +int init_chip_wc_pat(struct qib_devdata *dd, u32 vl15buflen) +{ + u64 __iomem *qib_kregbase = NULL; + void __iomem *qib_piobase = NULL; + u64 __iomem *qib_userbase = NULL; + u64 qib_kreglen; + u64 qib_pio2koffset = dd->piobufbase & 0xffffffff; + u64 qib_pio4koffset = dd->piobufbase >> 32; + u64 qib_pio2klen = dd->piobcnt2k * dd->palign; + u64 qib_pio4klen = dd->piobcnt4k * dd->align4k; + u64 qib_physaddr = dd->physaddr; + u64 qib_piolen; + u64 qib_userlen = 0; + + /* + * Free the old mapping because the kernel will try to reuse the + * old mapping and not create a new mapping with the + * write combining attribute. + */ + iounmap(dd->kregbase); + dd->kregbase = NULL; + + /* + * Assumes chip address space looks like: + * - kregs + sregs + cregs + uregs (in any order) + * - piobufs (2K and 4K bufs in either order) + * or: + * - kregs + sregs + cregs (in any order) + * - piobufs (2K and 4K bufs in either order) + * - uregs + */ + if (dd->piobcnt4k == 0) { + qib_kreglen = qib_pio2koffset; + qib_piolen = qib_pio2klen; + } else if (qib_pio2koffset < qib_pio4koffset) { + qib_kreglen = qib_pio2koffset; + qib_piolen = qib_pio4koffset + qib_pio4klen - qib_kreglen; + } else { + qib_kreglen = qib_pio4koffset; + qib_piolen = qib_pio2koffset + qib_pio2klen - qib_kreglen; + } + qib_piolen += vl15buflen; + /* Map just the configured ports (not all hw ports) */ + if (dd->uregbase > qib_kreglen) + qib_userlen = dd->ureg_align * dd->cfgctxts; + + /* Sanity checks passed, now create the new mappings */ + qib_kregbase = ioremap_nocache(qib_physaddr, qib_kreglen); + if (!qib_kregbase) { + qib_dbg("Unable to remap io addr %llx to kvirt\n", + qib_physaddr); + goto bail; + } + qib_cdbg(VERBOSE, "WC PAT remapped io addr %llx" + " to kregbase %p for %llx bytes\n", + qib_physaddr, qib_kregbase, qib_kreglen); + + qib_piobase = ioremap_wc(qib_physaddr + qib_kreglen, qib_piolen); + if (!qib_piobase) { + qib_dbg("Unable to remap io addr %llx to kvirt\n", + qib_physaddr + qib_kreglen); + goto bail_kregbase; + } + qib_cdbg(VERBOSE, "WC PAT remapped io addr %llx" + " to piobase %p for %llx bytes\n", + qib_physaddr + qib_kreglen, qib_piobase, qib_piolen); + + if (qib_userlen) { + qib_userbase = ioremap_nocache(qib_physaddr + dd->uregbase, + qib_userlen); + if (!qib_userbase) { + qib_dbg("Unable to remap io addr %llx to kvirt\n", + qib_physaddr + dd->uregbase); + goto bail_piobase; + } + qib_cdbg(VERBOSE, "WC PAT remapped io addr %llx" + " to userbase %p for %llx bytes\n", + qib_physaddr + dd->uregbase, + qib_userbase, qib_userlen); + } + + dd->kregbase = qib_kregbase; + dd->kregend = (u64 __iomem *) + ((char __iomem *) qib_kregbase + qib_kreglen); + dd->piobase = qib_piobase; + dd->pio2kbase = (void __iomem *) + (((char __iomem *) dd->piobase) + + qib_pio2koffset - qib_kreglen); + if (dd->piobcnt4k) + dd->pio4kbase = (void __iomem *) + (((char __iomem *) dd->piobase) + + qib_pio4koffset - qib_kreglen); + if (qib_userlen) + /* ureg will now be accessed relative to dd->userbase */ + dd->userbase = qib_userbase; + qib_cdbg(VERBOSE, "New addrs: kreg %p pio2k %p pio4k %p user %p\n", + dd->kregbase, dd->pio2kbase, dd->pio4kbase, dd->userbase); + return 0; + +bail_piobase: + iounmap(qib_piobase); +bail_kregbase: + iounmap(qib_kregbase); +bail: + return -ENOMEM; +} -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html