IB/ipath: Workaround problem of errormask register being overwritten

On some system hardware, we are seeing moderately common cases of the
chip errormask register being overwritten due to a chip bug in iba6120
that is triggered by a vendor-specific PCIe broadcast message.  This
patch merely checks periodically, and corrects it if needed (the
overwrite can cause us to not get error and hardware error
interrupts).  Also, make dd->ipath_errormask the one, true canonical
source for kr_errormask, and remove references to ipath_ignorederrs as
it is currently unused.

Signed-off-by: Dave Olson <dave.olson@qlogic.com>
Signed-off-by: John Gregor <john.gregor@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
This commit is contained in:
Dave Olson 2007-07-20 14:41:26 -07:00 committed by Roland Dreier
parent 3810f2a84e
commit 78d1e02fac
4 changed files with 66 additions and 29 deletions

View file

@ -851,13 +851,14 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit)
ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
dd->ipath_hwerrmask);
dd->ipath_maskederrs = dd->ipath_ignorederrs;
/* clear all */
ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
/* enable errors that are masked, at least this first time. */
ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
~dd->ipath_maskederrs);
/* clear any interrups up to this point (ints still not enabled) */
dd->ipath_errormask = ipath_read_kreg64(dd,
dd->ipath_kregs->kr_errormask);
/* clear any interrupts up to this point (ints still not enabled) */
ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
/*

View file

@ -517,10 +517,7 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
supp_msgs = handle_frequent_errors(dd, errs, msg, &noprint);
/*
* don't report errors that are masked (includes those always
* ignored)
*/
/* don't report errors that are masked */
errs &= ~dd->ipath_maskederrs;
/* do these first, they are most important */
@ -566,19 +563,19 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
* ones on this particular interrupt, which also isn't great
*/
dd->ipath_maskederrs |= dd->ipath_lasterror | errs;
dd->ipath_errormask &= ~dd->ipath_maskederrs;
ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
~dd->ipath_maskederrs);
dd->ipath_errormask);
s_iserr = ipath_decode_err(msg, sizeof msg,
(dd->ipath_maskederrs & ~dd->
ipath_ignorederrs));
dd->ipath_maskederrs);
if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) &
if (dd->ipath_maskederrs &
~(INFINIPATH_E_RRCVEGRFULL |
INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS))
ipath_dev_err(dd, "Temporarily disabling "
"error(s) %llx reporting; too frequent (%s)\n",
(unsigned long long) (dd->ipath_maskederrs &
~dd->ipath_ignorederrs), msg);
(unsigned long long)dd->ipath_maskederrs,
msg);
else {
/*
* rcvegrfull and rcvhdrqfull are "normal",
@ -793,6 +790,9 @@ void ipath_clear_freeze(struct ipath_devdata *dd)
/* disable error interrupts, to avoid confusion */
ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL);
/* also disable interrupts; errormask is sometimes overwriten */
ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
/*
* clear all sends, because they have may been
* completed by usercode while in freeze mode, and
@ -817,7 +817,7 @@ void ipath_clear_freeze(struct ipath_devdata *dd)
for (i = 0; i < dd->ipath_pioavregs; i++) {
/* deal with 6110 chip bug */
im = i > 3 ? ((i&1) ? i-1 : i+1) : i;
val = ipath_read_kreg64(dd, 0x1000+(im*sizeof(u64)));
val = ipath_read_kreg64(dd, (0x1000/sizeof(u64))+im);
dd->ipath_pioavailregs_dma[i] = dd->ipath_pioavailshadow[i]
= le64_to_cpu(val);
}
@ -832,7 +832,8 @@ void ipath_clear_freeze(struct ipath_devdata *dd)
ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
E_SPKT_ERRS_IGNORE);
ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
~dd->ipath_maskederrs);
dd->ipath_errormask);
ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, -1LL);
ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
}

View file

@ -261,18 +261,10 @@ struct ipath_devdata {
* limiting of hwerror reporting
*/
ipath_err_t ipath_lasthwerror;
/*
* errors masked because they occur too fast, also includes errors
* that are always ignored (ipath_ignorederrs)
*/
/* errors masked because they occur too fast */
ipath_err_t ipath_maskederrs;
/* time in jiffies at which to re-enable maskederrs */
unsigned long ipath_unmasktime;
/*
* errors always ignored (masked), at least for a given
* chip/device, because they are wrong or not useful
*/
ipath_err_t ipath_ignorederrs;
/* count of egrfull errors, combined for all ports */
u64 ipath_last_tidfull;
/* for ipath_qcheck() */
@ -436,6 +428,7 @@ struct ipath_devdata {
u64 ipath_lastibcstat;
/* hwerrmask shadow */
ipath_err_t ipath_hwerrmask;
ipath_err_t ipath_errormask; /* errormask shadow */
/* interrupt config reg shadow */
u64 ipath_intconfig;
/* kr_sendpiobufbase value */

View file

@ -196,6 +196,45 @@ static void ipath_qcheck(struct ipath_devdata *dd)
}
}
static void ipath_chk_errormask(struct ipath_devdata *dd)
{
static u32 fixed;
u32 ctrl;
unsigned long errormask;
unsigned long hwerrs;
if (!dd->ipath_errormask || !(dd->ipath_flags & IPATH_INITTED))
return;
errormask = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
if (errormask == dd->ipath_errormask)
return;
fixed++;
hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
dd->ipath_errormask);
if ((hwerrs & dd->ipath_hwerrmask) ||
(ctrl & INFINIPATH_C_FREEZEMODE)) {
/* force re-interrupt of pending events, just in case */
ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, 0ULL);
ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
dev_info(&dd->pcidev->dev,
"errormask fixed(%u) %lx -> %lx, ctrl %x hwerr %lx\n",
fixed, errormask, (unsigned long)dd->ipath_errormask,
ctrl, hwerrs);
} else
ipath_dbg("errormask fixed(%u) %lx -> %lx, no freeze\n",
fixed, errormask,
(unsigned long)dd->ipath_errormask);
}
/**
* ipath_get_faststats - get word counters from chip before they overflow
* @opaque - contains a pointer to the infinipath device ipath_devdata
@ -251,14 +290,13 @@ void ipath_get_faststats(unsigned long opaque)
dd->ipath_lasterror = 0;
if (dd->ipath_lasthwerror)
dd->ipath_lasthwerror = 0;
if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs)
if (dd->ipath_maskederrs
&& time_after(jiffies, dd->ipath_unmasktime)) {
char ebuf[256];
int iserr;
iserr = ipath_decode_err(ebuf, sizeof ebuf,
(dd->ipath_maskederrs & ~dd->
ipath_ignorederrs));
if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) &
dd->ipath_maskederrs);
if (dd->ipath_maskederrs &
~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
INFINIPATH_E_PKTERRS ))
ipath_dev_err(dd, "Re-enabling masked errors "
@ -278,9 +316,12 @@ void ipath_get_faststats(unsigned long opaque)
ipath_cdbg(ERRPKT, "Re-enabling packet"
" problem interrupt (%s)\n", ebuf);
}
dd->ipath_maskederrs = dd->ipath_ignorederrs;
/* re-enable masked errors */
dd->ipath_errormask |= dd->ipath_maskederrs;
ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
~dd->ipath_maskederrs);
dd->ipath_errormask);
dd->ipath_maskederrs = 0;
}
/* limit qfull messages to ~one per minute per port */
@ -294,6 +335,7 @@ void ipath_get_faststats(unsigned long opaque)
}
}
ipath_chk_errormask(dd);
done:
mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5);
}