* [PATCH 7/9] habanalabs: added open_stats info ioctl
2021-06-14 12:50 [PATCH 1/9] habanalabs: print more info when failing to pin user memory Oded Gabbay
` (4 preceding siblings ...)
2021-06-14 12:50 ` [PATCH 6/9] habanalabs/gaudi: set the correct rc in case of err Oded Gabbay
@ 2021-06-14 12:50 ` Oded Gabbay
2021-06-14 12:50 ` [PATCH 8/9] habanalabs/goya: add '__force' attribute to suppress false alarm Oded Gabbay
2021-06-14 12:51 ` [PATCH 9/9] habanalabs/gaudi: print last QM PQEs on error Oded Gabbay
7 siblings, 0 replies; 11+ messages in thread
From: Oded Gabbay @ 2021-06-14 12:50 UTC (permalink / raw)
To: linux-kernel; +Cc: Yuri Nudelman
From: Yuri Nudelman <ynudelman@habana.ai>
In a system with multiple ASICs, there is a need to provide monitoring
tools with information on how long a device was opened and how many
times a device was opened.
Therefore, we add a new opcode to the INFO ioctl to provide that
information.
Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/misc/habanalabs/common/device.c | 3 +++
drivers/misc/habanalabs/common/habanalabs.h | 8 +++++++
.../misc/habanalabs/common/habanalabs_drv.c | 3 +++
.../misc/habanalabs/common/habanalabs_ioctl.c | 21 +++++++++++++++++++
include/uapi/misc/habanalabs.h | 12 +++++++++++
5 files changed, 47 insertions(+)
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index e56f5170e338..37ce38d9a1a7 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -132,6 +132,9 @@ static int hl_device_release(struct inode *inode, struct file *filp)
dev_warn(hdev->dev,
"Device is still in use because there are live CS and/or memory mappings\n");
+ hdev->last_open_session_duration_jif =
+ jiffies - hdev->last_successful_open_jif;
+
return 0;
}
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 244fbf209d34..6c9a81c2cfe7 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2137,6 +2137,11 @@ struct hl_mmu_funcs {
* the error will be ignored by the driver during
* device initialization. Mainly used to debug and
* workaround firmware bugs
+ * @last_successful_open_jif: timestamp (jiffies) of the last successful
+ * device open.
+ * @last_open_session_duration_jif: duration (jiffies) of the last device open
+ * session.
+ * @open_counter: number of successful device open operations.
* @in_reset: is device in reset flow.
* @curr_pll_profile: current PLL profile.
* @card_type: Various ASICs have several card types. This indicates the card
@@ -2259,6 +2264,9 @@ struct hl_device {
u64 max_power;
u64 clock_gating_mask;
u64 boot_error_status_mask;
+ u64 last_successful_open_jif;
+ u64 last_open_session_duration_jif;
+ u64 open_counter;
atomic_t in_reset;
enum hl_pll_frequency curr_pll_profile;
enum cpucp_card_types card_type;
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index 4d377a39df13..4194cda2d04c 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -187,6 +187,9 @@ int hl_device_open(struct inode *inode, struct file *filp)
hl_debugfs_add_file(hpriv);
+ hdev->open_counter++;
+ hdev->last_successful_open_jif = jiffies;
+
return 0;
out_err:
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 6604d30246e6..f4dda7b4acdd 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -460,6 +460,24 @@ static int power_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
min((size_t) max_size, sizeof(power_info))) ? -EFAULT : 0;
}
+static int open_stats_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+ struct hl_device *hdev = hpriv->hdev;
+ u32 max_size = args->return_size;
+ struct hl_open_stats_info open_stats_info = {0};
+ void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+ if ((!max_size) || (!out))
+ return -EINVAL;
+
+ open_stats_info.last_open_period_ms = jiffies64_to_msecs(
+ hdev->last_open_session_duration_jif);
+ open_stats_info.open_counter = hdev->open_counter;
+
+ return copy_to_user(out, &open_stats_info,
+ min((size_t) max_size, sizeof(open_stats_info))) ? -EFAULT : 0;
+}
+
static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
struct device *dev)
{
@@ -543,6 +561,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_POWER:
return power_info(hpriv, args);
+ case HL_INFO_OPEN_STATS:
+ return open_stats_info(hpriv, args);
+
default:
dev_err(dev, "Invalid request %d\n", args->op);
rc = -ENOTTY;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index a47485a8d411..a47a731e4527 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -313,6 +313,7 @@ enum hl_device_status {
* HL_INFO_SYNC_MANAGER - Retrieve sync manager info per dcore
* HL_INFO_TOTAL_ENERGY - Retrieve total energy consumption
* HL_INFO_PLL_FREQUENCY - Retrieve PLL frequency
+ * HL_INFO_OPEN_STATS - Retrieve info regarding recent device open calls
*/
#define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1
@@ -331,6 +332,7 @@ enum hl_device_status {
#define HL_INFO_TOTAL_ENERGY 15
#define HL_INFO_PLL_FREQUENCY 16
#define HL_INFO_POWER 17
+#define HL_INFO_OPEN_STATS 18
#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
@@ -444,6 +446,16 @@ struct hl_pll_frequency_info {
__u16 output[HL_PLL_NUM_OUTPUTS];
};
+/**
+ * struct hl_open_stats_info - device open statistics information
+ * @open_counter: ever growing counter, increased on each successful dev open
+ * @last_open_period_ms: duration (ms) device was open last time
+ */
+struct hl_open_stats_info {
+ __u64 open_counter;
+ __u64 last_open_period_ms;
+};
+
/**
* struct hl_power_info - power information
* @power: power consumption
--
2.25.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 9/9] habanalabs/gaudi: print last QM PQEs on error
2021-06-14 12:50 [PATCH 1/9] habanalabs: print more info when failing to pin user memory Oded Gabbay
` (6 preceding siblings ...)
2021-06-14 12:50 ` [PATCH 8/9] habanalabs/goya: add '__force' attribute to suppress false alarm Oded Gabbay
@ 2021-06-14 12:51 ` Oded Gabbay
7 siblings, 0 replies; 11+ messages in thread
From: Oded Gabbay @ 2021-06-14 12:51 UTC (permalink / raw)
To: linux-kernel; +Cc: Ohad Sharabi
From: Ohad Sharabi <osharabi@habana.ai>
In case QMAN has an error and stop_on_err is true, print specific
information of the "offending" command buffer batch.
If the error occurred on one of the higher CPs, the CQ pointer and size
will be printed along with (up to) last 8 PQEs of the stream.
If the error occurred in the lower CP, the CQ pointer and size will be
printed along with (up to) last 8 PQEs of ALL upper CPs as we have no
way to know which upper CP sent the job there.
This is done so higher SW levels will be able to debug their CS by
extracting the raw data of the offending command buffer batch and
examine those offline to detect the issue.
Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/misc/habanalabs/gaudi/gaudi.c | 218 ++++++++++++++++++++-----
drivers/misc/habanalabs/gaudi/gaudiP.h | 1 +
2 files changed, 182 insertions(+), 37 deletions(-)
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index e66433d05616..a673e404f777 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7157,14 +7157,158 @@ static int gaudi_extract_ecc_info(struct hl_device *hdev,
return rc;
}
+/*
+ * gaudi_queue_idx_dec - decrement queue index (pi/ci) and handle wrap
+ *
+ * @idx: the current pi/ci value
+ * @q_len: the queue length (power of 2)
+ *
+ * @return the cyclically decremented index
+ */
+static inline u32 gaudi_queue_idx_dec(u32 idx, u32 q_len)
+{
+ u32 mask = q_len - 1;
+
+ /*
+ * modular decrement is equivalent to adding (queue_size -1)
+ * later we take LSBs to make sure the value is in the
+ * range [0, queue_len - 1]
+ */
+ return (idx + q_len - 1) & mask;
+}
+
+/**
+ * gaudi_print_sw_config_stream_data - print SW config stream data
+ *
+ * @hdev: pointer to the habanalabs device structure
+ * @stream: the QMAN's stream
+ * @qman_base: base address of QMAN registers block
+ */
+static void gaudi_print_sw_config_stream_data(struct hl_device *hdev, u32 stream,
+ u64 qman_base)
+{
+ u64 cq_ptr_lo, cq_ptr_hi, cq_tsize, cq_ptr;
+ u32 cq_ptr_lo_off, size;
+
+ cq_ptr_lo_off = mmTPC0_QM_CQ_PTR_LO_1 - mmTPC0_QM_CQ_PTR_LO_0;
+
+ cq_ptr_lo = qman_base + (mmTPC0_QM_CQ_PTR_LO_0 - mmTPC0_QM_BASE) +
+ stream * cq_ptr_lo_off;
+ cq_ptr_hi = cq_ptr_lo +
+ (mmTPC0_QM_CQ_PTR_HI_0 - mmTPC0_QM_CQ_PTR_LO_0);
+ cq_tsize = cq_ptr_lo +
+ (mmTPC0_QM_CQ_TSIZE_0 - mmTPC0_QM_CQ_PTR_LO_0);
+
+ cq_ptr = (((u64) RREG32(cq_ptr_hi)) << 32) | RREG32(cq_ptr_lo);
+ size = RREG32(cq_tsize);
+ dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %x\n",
+ stream, cq_ptr, size);
+}
+
+/**
+ * gaudi_print_last_pqes_on_err - print last PQEs on error
+ *
+ * @hdev: pointer to the habanalabs device structure
+ * @qid_base: first QID of the QMAN (out of 4 streams)
+ * @stream: the QMAN's stream
+ * @qman_base: base address of QMAN registers block
+ * @pr_sw_conf: if true print the SW config stream data (CQ PTR and SIZE)
+ */
+static void gaudi_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
+ u32 stream, u64 qman_base,
+ bool pr_sw_conf)
+{
+ u32 ci, qm_ci_stream_off, queue_len;
+ struct hl_hw_queue *q;
+ u64 pq_ci;
+ int i;
+
+ q = &hdev->kernel_queues[qid_base + stream];
+
+ qm_ci_stream_off = mmTPC0_QM_PQ_CI_1 - mmTPC0_QM_PQ_CI_0;
+ pq_ci = qman_base + (mmTPC0_QM_PQ_CI_0 - mmTPC0_QM_BASE) +
+ stream * qm_ci_stream_off;
+
+ queue_len = (q->queue_type == QUEUE_TYPE_INT) ?
+ q->int_queue_len : HL_QUEUE_LENGTH;
+
+ hdev->asic_funcs->hw_queues_lock(hdev);
+
+ if (pr_sw_conf)
+ gaudi_print_sw_config_stream_data(hdev, stream, qman_base);
+
+ ci = RREG32(pq_ci);
+
+ /* we should start printing form ci -1 */
+ ci = gaudi_queue_idx_dec(ci, queue_len);
+
+ for (i = 0; i < PQ_FETCHER_CACHE_SIZE; i++) {
+ struct hl_bd *bd;
+ u64 addr;
+ u32 len;
+
+ bd = q->kernel_address;
+ bd += ci;
+
+ len = le32_to_cpu(bd->len);
+ /* len 0 means uninitialized entry- break */
+ if (!len)
+ break;
+
+ addr = le64_to_cpu(bd->ptr);
+
+ dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %x\n",
+ stream, ci, addr, len);
+
+ /* get previous ci, wrap if needed */
+ ci = gaudi_queue_idx_dec(ci, queue_len);
+ }
+
+ hdev->asic_funcs->hw_queues_unlock(hdev);
+}
+
+/**
+ * print_qman_data_on_err - extract QMAN data on error
+ *
+ * @hdev: pointer to the habanalabs device structure
+ * @qid_base: first QID of the QMAN (out of 4 streams)
+ * @stream: the QMAN's stream
+ * @qman_base: base address of QMAN registers block
+ *
+ * This function attempt to exatract as much data as possible on QMAN error.
+ * On upper CP print the SW config stream data and last 8 PQEs.
+ * On lower CP print SW config data and last PQEs of ALL 4 upper CPs
+ */
+static void print_qman_data_on_err(struct hl_device *hdev, u32 qid_base,
+ u32 stream, u64 qman_base)
+{
+ u32 i;
+
+ if (stream != QMAN_STREAMS) {
+ gaudi_print_last_pqes_on_err(hdev, qid_base, stream, qman_base,
+ true);
+ return;
+ }
+
+ gaudi_print_sw_config_stream_data(hdev, stream, qman_base);
+
+ for (i = 0; i < QMAN_STREAMS; i++)
+ gaudi_print_last_pqes_on_err(hdev, qid_base, i, qman_base,
+ false);
+}
+
static void gaudi_handle_qman_err_generic(struct hl_device *hdev,
const char *qm_name,
- u64 glbl_sts_addr,
- u64 arb_err_addr)
+ u64 qman_base,
+ u32 qid_base)
{
u32 i, j, glbl_sts_val, arb_err_val, glbl_sts_clr_val;
+ u64 glbl_sts_addr, arb_err_addr;
char reg_desc[32];
+ glbl_sts_addr = qman_base + (mmTPC0_QM_GLBL_STS1_0 - mmTPC0_QM_BASE);
+ arb_err_addr = qman_base + (mmTPC0_QM_ARB_ERR_CAUSE - mmTPC0_QM_BASE);
+
/* Iterate through all stream GLBL_STS1 registers + Lower CP */
for (i = 0 ; i < QMAN_STREAMS + 1 ; i++) {
glbl_sts_clr_val = 0;
@@ -7191,6 +7335,8 @@ static void gaudi_handle_qman_err_generic(struct hl_device *hdev,
/* Write 1 clear errors */
if (!hdev->stop_on_err)
WREG32(glbl_sts_addr + 4 * i, glbl_sts_clr_val);
+ else
+ print_qman_data_on_err(hdev, qid_base, i, qman_base);
}
arb_err_val = RREG32(arb_err_addr);
@@ -7335,90 +7481,88 @@ static void gaudi_handle_ecc_event(struct hl_device *hdev, u16 event_type,
static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type)
{
- u64 glbl_sts_addr, arb_err_addr;
- u8 index;
+ u64 qman_base;
char desc[32];
+ u32 qid_base;
+ u8 index;
switch (event_type) {
case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM:
index = event_type - GAUDI_EVENT_TPC0_QM;
- glbl_sts_addr =
- mmTPC0_QM_GLBL_STS1_0 + index * TPC_QMAN_OFFSET;
- arb_err_addr =
- mmTPC0_QM_ARB_ERR_CAUSE + index * TPC_QMAN_OFFSET;
+ qid_base = GAUDI_QUEUE_ID_TPC_0_0 + index * QMAN_STREAMS;
+ qman_base = mmTPC0_QM_BASE + index * TPC_QMAN_OFFSET;
snprintf(desc, ARRAY_SIZE(desc), "%s%d", "TPC_QM", index);
break;
case GAUDI_EVENT_MME0_QM ... GAUDI_EVENT_MME2_QM:
index = event_type - GAUDI_EVENT_MME0_QM;
- glbl_sts_addr =
- mmMME0_QM_GLBL_STS1_0 + index * MME_QMAN_OFFSET;
- arb_err_addr =
- mmMME0_QM_ARB_ERR_CAUSE + index * MME_QMAN_OFFSET;
+ qid_base = GAUDI_QUEUE_ID_MME_0_0 + index * QMAN_STREAMS;
+ qman_base = mmMME0_QM_BASE + index * MME_QMAN_OFFSET;
snprintf(desc, ARRAY_SIZE(desc), "%s%d", "MME_QM", index);
break;
case GAUDI_EVENT_DMA0_QM ... GAUDI_EVENT_DMA7_QM:
index = event_type - GAUDI_EVENT_DMA0_QM;
- glbl_sts_addr =
- mmDMA0_QM_GLBL_STS1_0 + index * DMA_QMAN_OFFSET;
- arb_err_addr =
- mmDMA0_QM_ARB_ERR_CAUSE + index * DMA_QMAN_OFFSET;
+ qid_base = GAUDI_QUEUE_ID_DMA_0_0 + index * QMAN_STREAMS;
+ /* skip GAUDI_QUEUE_ID_CPU_PQ if necessary */
+ if (index > 1)
+ qid_base++;
+ qman_base = mmDMA0_QM_BASE + index * DMA_QMAN_OFFSET;
snprintf(desc, ARRAY_SIZE(desc), "%s%d", "DMA_QM", index);
break;
case GAUDI_EVENT_NIC0_QM0:
- glbl_sts_addr = mmNIC0_QM0_GLBL_STS1_0;
- arb_err_addr = mmNIC0_QM0_ARB_ERR_CAUSE;
+ qid_base = GAUDI_QUEUE_ID_NIC_0_0;
+ qman_base = mmNIC0_QM0_BASE;
snprintf(desc, ARRAY_SIZE(desc), "NIC0_QM0");
break;
case GAUDI_EVENT_NIC0_QM1:
- glbl_sts_addr = mmNIC0_QM1_GLBL_STS1_0;
- arb_err_addr = mmNIC0_QM1_ARB_ERR_CAUSE;
+ qid_base = GAUDI_QUEUE_ID_NIC_1_0;
+ qman_base = mmNIC0_QM1_BASE;
snprintf(desc, ARRAY_SIZE(desc), "NIC0_QM1");
break;
case GAUDI_EVENT_NIC1_QM0:
- glbl_sts_addr = mmNIC1_QM0_GLBL_STS1_0;
- arb_err_addr = mmNIC1_QM0_ARB_ERR_CAUSE;
+ qid_base = GAUDI_QUEUE_ID_NIC_2_0;
+ qman_base = mmNIC1_QM0_BASE;
snprintf(desc, ARRAY_SIZE(desc), "NIC1_QM0");
break;
case GAUDI_EVENT_NIC1_QM1:
- glbl_sts_addr = mmNIC1_QM1_GLBL_STS1_0;
- arb_err_addr = mmNIC1_QM1_ARB_ERR_CAUSE;
+ qid_base = GAUDI_QUEUE_ID_NIC_3_0;
+ qman_base = mmNIC1_QM1_BASE;
snprintf(desc, ARRAY_SIZE(desc), "NIC1_QM1");
break;
case GAUDI_EVENT_NIC2_QM0:
- glbl_sts_addr = mmNIC2_QM0_GLBL_STS1_0;
- arb_err_addr = mmNIC2_QM0_ARB_ERR_CAUSE;
+ qid_base = GAUDI_QUEUE_ID_NIC_4_0;
+ qman_base = mmNIC2_QM0_BASE;
snprintf(desc, ARRAY_SIZE(desc), "NIC2_QM0");
break;
case GAUDI_EVENT_NIC2_QM1:
- glbl_sts_addr = mmNIC2_QM1_GLBL_STS1_0;
- arb_err_addr = mmNIC2_QM1_ARB_ERR_CAUSE;
+ qid_base = GAUDI_QUEUE_ID_NIC_5_0;
+ qman_base = mmNIC2_QM1_BASE;
snprintf(desc, ARRAY_SIZE(desc), "NIC2_QM1");
break;
case GAUDI_EVENT_NIC3_QM0:
- glbl_sts_addr = mmNIC3_QM0_GLBL_STS1_0;
- arb_err_addr = mmNIC3_QM0_ARB_ERR_CAUSE;
+ qid_base = GAUDI_QUEUE_ID_NIC_6_0;
+ qman_base = mmNIC3_QM0_BASE;
snprintf(desc, ARRAY_SIZE(desc), "NIC3_QM0");
break;
case GAUDI_EVENT_NIC3_QM1:
- glbl_sts_addr = mmNIC3_QM1_GLBL_STS1_0;
- arb_err_addr = mmNIC3_QM1_ARB_ERR_CAUSE;
+ qid_base = GAUDI_QUEUE_ID_NIC_7_0;
+ qman_base = mmNIC3_QM1_BASE;
snprintf(desc, ARRAY_SIZE(desc), "NIC3_QM1");
break;
case GAUDI_EVENT_NIC4_QM0:
- glbl_sts_addr = mmNIC4_QM0_GLBL_STS1_0;
- arb_err_addr = mmNIC4_QM0_ARB_ERR_CAUSE;
+ qid_base = GAUDI_QUEUE_ID_NIC_8_0;
+ qman_base = mmNIC4_QM0_BASE;
snprintf(desc, ARRAY_SIZE(desc), "NIC4_QM0");
break;
case GAUDI_EVENT_NIC4_QM1:
- glbl_sts_addr = mmNIC4_QM1_GLBL_STS1_0;
- arb_err_addr = mmNIC4_QM1_ARB_ERR_CAUSE;
+ qid_base = GAUDI_QUEUE_ID_NIC_9_0;
+ qman_base = mmNIC4_QM1_BASE;
snprintf(desc, ARRAY_SIZE(desc), "NIC4_QM1");
break;
default:
return;
}
- gaudi_handle_qman_err_generic(hdev, desc, glbl_sts_addr, arb_err_addr);
+ gaudi_handle_qman_err_generic(hdev, desc, qman_base, qid_base);
}
static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index 48637a6343bb..b23336af191e 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -82,6 +82,7 @@
QMAN_STREAMS)
#define QMAN_STREAMS 4
+#define PQ_FETCHER_CACHE_SIZE 8
#define DMA_QMAN_OFFSET (mmDMA1_QM_BASE - mmDMA0_QM_BASE)
#define TPC_QMAN_OFFSET (mmTPC1_QM_BASE - mmTPC0_QM_BASE)
--
2.25.1
^ permalink raw reply related [flat|nested] 11+ messages in thread