From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1756852Ab2APVT2 (ORCPT ); Mon, 16 Jan 2012 16:19:28 -0500 Received: from relay1.sgi.com ([192.48.179.29]:56886 "EHLO relay.sgi.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1755304Ab2APVT1 (ORCPT ); Mon, 16 Jan 2012 16:19:27 -0500 Date: Mon, 16 Jan 2012 15:19:47 -0600 From: Cliff Wickman To: linux-kernel@vger.kernel.org, mingo@elte.hu, x86@kernel.org Cc: mingo@elte.hu, x86@kernel.org Subject: [PATCH 3/6] x86: workaround for UV2 BAU bug Message-ID: <20120116211947.GC5767@sgi.com> References: <20120116211617.GD5512@sgi.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20120116211617.GD5512@sgi.com> User-Agent: Mutt/1.5.17+20080114 (2008-01-14) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org This patch implements a workaround for a UV2 hardware bug. The bug is a non-atomic update of a memory-mapped register. When hardware message delivery and software message acknowledge occur simultaneously the pending message acknowledge for the arriving message may be lost. This causes the sender's message status to stay busy. Part of the workaround is to not acknowledge a completed message until it is verified that no other message is actually using the resource that is mistakenly recorded in the completed message. Part of the workaround is to test for long elapsed time in such a busy condition, then handle it by using a spare sending descriptor. The stay-busy condition is eventually timed out by hardware, and then the original sending descriptor can be re-used. Most of that logic change is in keeping track of the current descriptor and the state of the spares. The occurrences of the workaround are added to the BAU statistics. Signed-off-by: Cliff Wickman --- arch/x86/include/asm/uv/uv_bau.h | 13 + arch/x86/platform/uv/tlb_uv.c | 274 ++++++++++++++++++++++++++++++++++----- 2 files changed, 254 insertions(+), 33 deletions(-) Index: 120113.linux-3.2.1/arch/x86/include/asm/uv/uv_bau.h =================================================================== --- 120113.linux-3.2.1.orig/arch/x86/include/asm/uv/uv_bau.h +++ 120113.linux-3.2.1/arch/x86/include/asm/uv/uv_bau.h @@ -167,6 +167,7 @@ #define FLUSH_RETRY_TIMEOUT 2 #define FLUSH_GIVEUP 3 #define FLUSH_COMPLETE 4 +#define FLUSH_RETRY_BUSYBUG 5 /* * tuning the action when the numalink network is extremely delayed @@ -463,7 +464,6 @@ struct bau_pq_entry { struct msg_desc { struct bau_pq_entry *msg; int msg_slot; - int swack_slot; struct bau_pq_entry *queue_first; struct bau_pq_entry *queue_last; }; @@ -517,6 +517,9 @@ struct ptc_stats { unsigned long s_retry_messages; /* retry broadcasts */ unsigned long s_bau_reenabled; /* for bau enable/disable */ unsigned long s_bau_disabled; /* for bau enable/disable */ + unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */ + unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ + unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ /* destination statistics */ unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ @@ -593,6 +596,8 @@ struct bau_control { short cpus_in_socket; short cpus_in_uvhub; short partition_base_pnode; + short using_desc; /* an index, like uvhub_cpu */ + unsigned int inuse_map; unsigned short message_number; unsigned short uvhub_quiesce; short socket_acknowledge_count[DEST_Q_SIZE]; @@ -610,6 +615,7 @@ struct bau_control { int cong_response_us; int cong_reps; int cong_period; + unsigned long clocks_per_100_usec; cycles_t period_time; long period_requests; struct hub_and_pnode *thp; @@ -670,6 +676,11 @@ static inline void write_mmr_sw_ack(unsi uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); } +static inline void write_gmmr_sw_ack(int pnode, unsigned long mr) +{ + write_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); +} + static inline unsigned long read_mmr_sw_ack(void) { return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); Index: 120113.linux-3.2.1/arch/x86/platform/uv/tlb_uv.c =================================================================== --- 120113.linux-3.2.1.orig/arch/x86/platform/uv/tlb_uv.c +++ 120113.linux-3.2.1/arch/x86/platform/uv/tlb_uv.c @@ -157,13 +157,14 @@ static int __init uvhub_to_first_apicid( * clear of the Timeout bit (as well) will free the resource. No reply will * be sent (the hardware will only do one reply per message). */ -static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp) +static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp, + int do_acknowledge) { unsigned long dw; struct bau_pq_entry *msg; msg = mdp->msg; - if (!msg->canceled) { + if (!msg->canceled && do_acknowledge) { dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec; write_mmr_sw_ack(dw); } @@ -212,8 +213,8 @@ static void bau_process_retry_msg(struct if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { unsigned long mr; /* - * is the resource timed out? - * make everyone ignore the cancelled message. + * Is the resource timed out? + * Make everyone ignore the cancelled message. */ msg2->canceled = 1; stat->d_canceled++; @@ -231,8 +232,8 @@ static void bau_process_retry_msg(struct * Do all the things a cpu should do for a TLB shootdown message. * Other cpu's may come here at the same time for this message. */ -static void bau_process_message(struct msg_desc *mdp, - struct bau_control *bcp) +static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, + int do_acknowledge) { short socket_ack_count = 0; short *sp; @@ -284,8 +285,9 @@ static void bau_process_message(struct m if (msg_ack_count == bcp->cpus_in_uvhub) { /* * All cpus in uvhub saw it; reply + * (unless we are in the UV2 workaround) */ - reply_to_message(mdp, bcp); + reply_to_message(mdp, bcp, do_acknowledge); } } @@ -491,27 +493,138 @@ static int uv1_wait_completion(struct ba /* * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. */ -static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu) +static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) { unsigned long descriptor_status; unsigned long descriptor_status2; descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); - descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL; + descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; descriptor_status = (descriptor_status << 1) | descriptor_status2; return descriptor_status; } +/* + * Return whether the status of the descriptor that is normally used for this + * cpu (the one indexed by its hub-relative cpu number) is busy. + * The status of the original 32 descriptors is always reflected in the 64 + * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0. + * The bit provided by the activation_status_2 register is irrelevant to + * the status if it is only being tested for busy or not busy. + */ +int normal_busy(struct bau_control *bcp) +{ + int cpu = bcp->uvhub_cpu; + int mmr_offset; + int right_shift; + + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; + right_shift = cpu * UV_ACT_STATUS_SIZE; + return (((((read_lmmr(mmr_offset) >> right_shift) & + UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY); +} + +/* + * Entered when a bau descriptor has gone into a permanent busy wait because + * of a hardware bug. + * Workaround the bug. + */ +int handle_uv2_busy(struct bau_control *bcp) +{ + int busy_one = bcp->using_desc; + int normal = bcp->uvhub_cpu; + int selected = -1; + int i; + unsigned long descriptor_status; + unsigned long status; + int mmr_offset; + struct bau_desc *bau_desc_old; + struct bau_desc *bau_desc_new; + struct bau_control *hmaster = bcp->uvhub_master; + struct ptc_stats *stat = bcp->statp; + cycles_t ttm; + + stat->s_uv2_wars++; + spin_lock(&hmaster->uvhub_lock); + /* try for the original first */ + if (busy_one != normal) { + if (!normal_busy(bcp)) + selected = normal; + } + if (selected < 0) { + /* can't use the normal, select an alternate */ + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; + descriptor_status = read_lmmr(mmr_offset); + + /* scan available descriptors 32-63 */ + for (i = 0; i < UV_CPUS_PER_AS; i++) { + if ((hmaster->inuse_map & (1 << i)) == 0) { + status = ((descriptor_status >> + (i * UV_ACT_STATUS_SIZE)) & + UV_ACT_STATUS_MASK) << 1; + if (status != UV2H_DESC_BUSY) { + selected = i + UV_CPUS_PER_AS; + break; + } + } + } + } + + if (busy_one != normal) + /* mark the busy alternate as not in-use */ + hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS)); + + if (selected >= 0) { + /* switch to the selected descriptor */ + if (selected != normal) { + /* set the selected alternate as in-use */ + hmaster->inuse_map |= + (1 << (selected - UV_CPUS_PER_AS)); + if (selected > stat->s_uv2_wars_hw) + stat->s_uv2_wars_hw = selected; + } + bau_desc_old = bcp->descriptor_base; + bau_desc_old += (ITEMS_PER_DESC * busy_one); + bcp->using_desc = selected; + bau_desc_new = bcp->descriptor_base; + bau_desc_new += (ITEMS_PER_DESC * selected); + *bau_desc_new = *bau_desc_old; + } else { + /* + * All are busy. Wait for the normal one for this cpu to + * free up. + */ + stat->s_uv2_war_waits++; + spin_unlock(&hmaster->uvhub_lock); + ttm = get_cycles(); + do { + cpu_relax(); + } while (normal_busy(bcp)); + spin_lock(&hmaster->uvhub_lock); + /* switch to the original descriptor */ + bcp->using_desc = normal; + bau_desc_old = bcp->descriptor_base; + bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc); + bcp->using_desc = (ITEMS_PER_DESC * normal); + bau_desc_new = bcp->descriptor_base; + bau_desc_new += (ITEMS_PER_DESC * normal); + *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */ + } + spin_unlock(&hmaster->uvhub_lock); + return FLUSH_RETRY_BUSYBUG; +} + static int uv2_wait_completion(struct bau_desc *bau_desc, unsigned long mmr_offset, int right_shift, struct bau_control *bcp, long try) { unsigned long descriptor_stat; cycles_t ttm; - int cpu = bcp->uvhub_cpu; + int desc = bcp->using_desc; + long busy_reps = 0; struct ptc_stats *stat = bcp->statp; - descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu); + descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc); /* spin on the status MMR, waiting for it to go idle */ while (descriptor_stat != UV2H_DESC_IDLE) { @@ -542,12 +655,23 @@ static int uv2_wait_completion(struct ba bcp->conseccompletes = 0; return FLUSH_RETRY_TIMEOUT; } else { + busy_reps++; + if (busy_reps > 1000000) { + /* not to hammer on the clock */ + busy_reps = 0; + ttm = get_cycles(); + if ((ttm - bcp->send_message) > + (bcp->clocks_per_100_usec)) { + return handle_uv2_busy(bcp); + } + } /* * descriptor_stat is still BUSY */ cpu_relax(); } - descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu); + descriptor_stat = uv2_read_status(mmr_offset, right_shift, + desc); } bcp->conseccompletes++; return FLUSH_COMPLETE; @@ -563,14 +687,14 @@ static int wait_completion(struct bau_de { int right_shift; unsigned long mmr_offset; - int cpu = bcp->uvhub_cpu; + int desc = bcp->using_desc; - if (cpu < UV_CPUS_PER_AS) { + if (desc < UV_CPUS_PER_AS) { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = cpu * UV_ACT_STATUS_SIZE; + right_shift = desc * UV_ACT_STATUS_SIZE; } else { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); + right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); } if (bcp->uvhub_version == 1) @@ -752,8 +876,7 @@ static void handle_cmplt(int completion_ * Returns 1 if it gives up entirely and the original cpu mask is to be * returned to the kernel. */ -int uv_flush_send_and_wait(struct bau_desc *bau_desc, - struct cpumask *flush_mask, struct bau_control *bcp) +int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) { int seq_number = 0; int completion_stat = 0; @@ -766,20 +889,24 @@ int uv_flush_send_and_wait(struct bau_de struct bau_control *hmaster = bcp->uvhub_master; struct uv1_bau_msg_header *uv1_hdr = NULL; struct uv2_bau_msg_header *uv2_hdr = NULL; + struct bau_desc *bau_desc; - if (bcp->uvhub_version == 1) { - uv1 = 1; + if (bcp->uvhub_version == 1) uv1_throttle(hmaster, stat); - uv1_hdr = &bau_desc->header.uv1_hdr; - } else - uv2_hdr = &bau_desc->header.uv2_hdr; while (hmaster->uvhub_quiesce) cpu_relax(); time1 = get_cycles(); do { - if (try == 0) { + bau_desc = bcp->descriptor_base; + bau_desc += (ITEMS_PER_DESC * bcp->using_desc); + if (bcp->uvhub_version == 1) { + uv1 = 1; + uv1_hdr = &bau_desc->header.uv1_hdr; + } else + uv2_hdr = &bau_desc->header.uv2_hdr; + if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) { if (uv1) uv1_hdr->msg_type = MSG_REGULAR; else @@ -797,13 +924,14 @@ int uv_flush_send_and_wait(struct bau_de uv1_hdr->sequence = seq_number; else uv2_hdr->sequence = seq_number; - index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; + index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; bcp->send_message = get_cycles(); write_mmr_activation(index); try++; completion_stat = wait_completion(bau_desc, bcp, try); + /* UV2: wait_completion() may change the bcp->using_desc */ handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); @@ -814,6 +942,7 @@ int uv_flush_send_and_wait(struct bau_de } cpu_relax(); } while ((completion_stat == FLUSH_RETRY_PLUGGED) || + (completion_stat == FLUSH_RETRY_BUSYBUG) || (completion_stat == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); @@ -828,6 +957,7 @@ int uv_flush_send_and_wait(struct bau_de record_send_stats(time1, time2, bcp, stat, completion_stat, try); if (completion_stat == FLUSH_GIVEUP) + /* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */ return 1; return 0; } @@ -983,7 +1113,7 @@ const struct cpumask *uv_flush_tlb_other stat->s_ntargself++; bau_desc = bcp->descriptor_base; - bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu); + bau_desc += (ITEMS_PER_DESC * bcp->using_desc); bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) return NULL; @@ -996,13 +1126,86 @@ const struct cpumask *uv_flush_tlb_other * uv_flush_send_and_wait returns 0 if all cpu's were messaged, * or 1 if it gave up and the original cpumask should be returned. */ - if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) + if (!uv_flush_send_and_wait(flush_mask, bcp)) return NULL; else return cpumask; } /* + * Search the message queue for any 'other' message with the same software + * acknowledge resource bit vector. + */ +struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, + struct bau_control *bcp, unsigned char swack_vec) +{ + struct bau_pq_entry *msg_next = msg + 1; + + if (msg_next > bcp->queue_last) + msg_next = bcp->queue_first; + while ((msg_next->swack_vec != 0) && (msg_next != msg)) { + if (msg_next->swack_vec == swack_vec) + return msg_next; + msg_next++; + if (msg_next > bcp->queue_last) + msg_next = bcp->queue_first; + } + return NULL; +} + +/* + * UV2 needs to work around a bug in which an arriving message has not + * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register. + * Such a message must be ignored. + */ +void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) +{ + unsigned long mmr_image; + unsigned char swack_vec; + struct bau_pq_entry *msg = mdp->msg; + struct bau_pq_entry *other_msg; + + mmr_image = read_mmr_sw_ack(); + swack_vec = msg->swack_vec; + + if ((swack_vec & mmr_image) == 0) { + /* + * This message was assigned a swack resource, but no + * reserved acknowlegment is pending. + * The bug has prevented this message from setting the MMR. + * And no other message has used the same sw_ack resource. + * Do the requested shootdown but do not reply to the msg. + * (the 0 means make no acknowledge) + */ + bau_process_message(mdp, bcp, 0); + return; + } + + /* + * Some message has set the MMR 'pending' bit; it might have been + * another message. Look for that message. + */ + other_msg = find_another_by_swack(msg, bcp, msg->swack_vec); + if (other_msg) { + /* There is another. Do not ack the current one. */ + bau_process_message(mdp, bcp, 0); + /* + * Let the natural processing of that message acknowledge + * it. Don't get the processing of sw_ack's out of order. + */ + return; + } + + /* + * There is no other message using this sw_ack, so it is safe to + * acknowledge it. + */ + bau_process_message(mdp, bcp, 1); + + return; +} + +/* * The BAU message interrupt comes here. (registered by set_intr_gate) * See entry_64.S * @@ -1038,9 +1241,11 @@ void uv_bau_message_interrupt(struct pt_ count++; msgdesc.msg_slot = msg - msgdesc.queue_first; - msgdesc.swack_slot = ffs(msg->swack_vec) - 1; msgdesc.msg = msg; - bau_process_message(&msgdesc, bcp); + if (bcp->uvhub_version == 2) + process_uv2_message(&msgdesc, bcp); + else + bau_process_message(&msgdesc, bcp, 1); msg++; if (msg > msgdesc.queue_last) @@ -1158,7 +1363,7 @@ static int ptc_seq_show(struct seq_file seq_printf(file, "all one mult none retry canc nocan reset rcan "); seq_printf(file, - "disable enable\n"); + "disable enable wars warshw warwaits\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); @@ -1189,8 +1394,10 @@ static int ptc_seq_show(struct seq_file stat->d_nomsg, stat->d_retries, stat->d_canceled, stat->d_nocanceled, stat->d_resets, stat->d_rcanceled); - seq_printf(file, "%ld %ld\n", - stat->s_bau_disabled, stat->s_bau_reenabled); + seq_printf(file, "%ld %ld %ld %ld %ld\n", + stat->s_bau_disabled, stat->s_bau_reenabled, + stat->s_uv2_wars, stat->s_uv2_wars_hw, + stat->s_uv2_war_waits); } return 0; } @@ -1564,6 +1771,7 @@ static void pq_init(int node, int pnode) write_mmr_payload_first(pnode, pn_first); write_mmr_payload_tail(pnode, first); write_mmr_payload_last(pnode, last); + write_gmmr_sw_ack(pnode, 0xffffUL); /* in effect, all msg_type's are set to MSG_NOOP */ memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE); @@ -1651,6 +1859,7 @@ static void __init init_per_cpu_tunables bcp->cong_response_us = congested_respns_us; bcp->cong_reps = congested_reps; bcp->cong_period = congested_period; + bcp->clocks_per_100_usec = usec_2_cycles(100); } } @@ -1771,6 +1980,7 @@ static int scan_sock(struct socket_desc } bcp->uvhub_master = *hmasterp; bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; + bcp->using_desc = bcp->uvhub_cpu; if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { printk(KERN_EMERG "%d cpus per uvhub invalid\n", bcp->uvhub_cpu);