linux-trace-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] kernel-shark: Increase the size of the task hash
@ 2019-08-28 18:00 Steven Rostedt
  2019-08-29 12:46 ` Yordan Karadzhov (VMware)
  0 siblings, 1 reply; 5+ messages in thread
From: Steven Rostedt @ 2019-08-28 18:00 UTC (permalink / raw)
  To: Linux Trace Devel; +Cc: Yordan Karadzhov

From: Steven Rostedt (VMware) <rostedt@goodmis.org>

When loading a data file that contained 100,000s of tasks, using a 256
bucket size hash crippled it. By increasing the hash to 2^16 (65536) it
solves the issue (still small enough not to waste too much memory).

In the process, I changed the knuth_hash() in libkshark.c to use the 32
bit version and just have the key use what it needs:

  key = knuth_hash();
  key += knuth_hash >> SHIFT;
  key &= (1 << SHIFT) - 1;

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
diff --git a/kernel-shark/src/libkshark.c b/kernel-shark/src/libkshark.c
index 4201fa02..41572e18 100644
--- a/kernel-shark/src/libkshark.c
+++ b/kernel-shark/src/libkshark.c
@@ -252,19 +252,19 @@ void kshark_free(struct kshark_context *kshark_ctx)
 	free(kshark_ctx);
 }
 
-static inline uint8_t knuth_hash(uint32_t val)
+static inline uint32_t knuth_hash(uint32_t val)
 {
 	/*
-	 * Small table hashing function adapted from Donald E. Knuth's 32 bit
+	 * Hashing function adapted from Donald E. Knuth's 32 bit
 	 * multiplicative hash.  See The Art of Computer Programming (TAOCP).
 	 * Multiplication by the Prime number, closest to the golden ratio of
-	 * 2^8.
+	 * 2^32.
 	 */
-	return UINT8_C(val) * UINT8_C(157);
+	return val * UINT32_C(2654435761);
 }
 
 static struct kshark_task_list *
-kshark_find_task(struct kshark_context *kshark_ctx, uint8_t key, int pid)
+kshark_find_task(struct kshark_context *kshark_ctx, uint32_t key, int pid)
 {
 	struct kshark_task_list *list;
 
@@ -280,9 +280,12 @@ static struct kshark_task_list *
 kshark_add_task(struct kshark_context *kshark_ctx, int pid)
 {
 	struct kshark_task_list *list;
-	uint8_t key;
+	uint32_t key;
 
 	key = knuth_hash(pid);
+	key += key >> KS_TASK_HASH_SHIFT;
+	key &= (1 << KS_TASK_HASH_SHIFT) - 1;
+
 	list = kshark_find_task(kshark_ctx, key, pid);
 	if (list)
 		return list;
diff --git a/kernel-shark/src/libkshark.h b/kernel-shark/src/libkshark.h
index 04e9cbfc..3407db19 100644
--- a/kernel-shark/src/libkshark.h
+++ b/kernel-shark/src/libkshark.h
@@ -72,7 +72,8 @@ struct kshark_entry {
 };
 
 /** Size of the task's hash table. */
-#define KS_TASK_HASH_SIZE 256
+#define KS_TASK_HASH_SHIFT 16
+#define KS_TASK_HASH_SIZE (1 << KS_TASK_HASH_SHIFT)
 
 /** Linked list of tasks. */
 struct kshark_task_list {

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] kernel-shark: Increase the size of the task hash
  2019-08-28 18:00 [PATCH] kernel-shark: Increase the size of the task hash Steven Rostedt
@ 2019-08-29 12:46 ` Yordan Karadzhov (VMware)
  2019-08-29 13:18   ` Steven Rostedt
  2019-08-29 15:49   ` Steven Rostedt
  0 siblings, 2 replies; 5+ messages in thread
From: Yordan Karadzhov (VMware) @ 2019-08-29 12:46 UTC (permalink / raw)
  To: Steven Rostedt, Linux Trace Devel



On 28.08.19 г. 21:00 ч., Steven Rostedt wrote:
> From: Steven Rostedt (VMware) <rostedt@goodmis.org>
> 
> When loading a data file that contained 100,000s of tasks, using a 256
> bucket size hash crippled it. By increasing the hash to 2^16 (65536) it
> solves the issue (still small enough not to waste too much memory).
> 
> In the process, I changed the knuth_hash() in libkshark.c to use the 32
> bit version and just have the key use what it needs:

If the size the hash table is 65536 then the multiplicative hashing 
function has to multiply the key by a prime number which is closer to 
65536 / 1.61803398875 (the so called "golden ratio" of the size).
It makes no sense to multiply by a number that is several orders of 
magnitude bigger than the size of the hash table.

> 
>    key = knuth_hash();
>    key += knuth_hash >> SHIFT;
>    key &= (1 << SHIFT) - 1;
> 
> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> ---
> diff --git a/kernel-shark/src/libkshark.c b/kernel-shark/src/libkshark.c
> index 4201fa02..41572e18 100644
> --- a/kernel-shark/src/libkshark.c
> +++ b/kernel-shark/src/libkshark.c
> @@ -252,19 +252,19 @@ void kshark_free(struct kshark_context *kshark_ctx)
>   	free(kshark_ctx);
>   }
>   
> -static inline uint8_t knuth_hash(uint32_t val)
> +static inline uint32_t knuth_hash(uint32_t val)
>   {
>   	/*
> -	 * Small table hashing function adapted from Donald E. Knuth's 32 bit
> +	 * Hashing function adapted from Donald E. Knuth's 32 bit
>   	 * multiplicative hash.  See The Art of Computer Programming (TAOCP).
>   	 * Multiplication by the Prime number, closest to the golden ratio of
> -	 * 2^8.
> +	 * 2^32.
>   	 */
> -	return UINT8_C(val) * UINT8_C(157);
> +	return val * UINT32_C(2654435761);

So according to my understanding of the idea in the book, this line 
should look look this:

	return UINT16_C(val) * UINT8_16(65537);

and you do not need to do any additional manipulation of the key (PID).
Also the function has to return uint16_t.

Thanks!
Yordan

>   }
>   
>   static struct kshark_task_list *
> -kshark_find_task(struct kshark_context *kshark_ctx, uint8_t key, int pid)
> +kshark_find_task(struct kshark_context *kshark_ctx, uint32_t key, int pid)
>   {
>   	struct kshark_task_list *list;
>   
> @@ -280,9 +280,12 @@ static struct kshark_task_list *
>   kshark_add_task(struct kshark_context *kshark_ctx, int pid)
>   {
>   	struct kshark_task_list *list;
> -	uint8_t key;
> +	uint32_t key;
>   
>   	key = knuth_hash(pid);
> +	key += key >> KS_TASK_HASH_SHIFT;
> +	key &= (1 << KS_TASK_HASH_SHIFT) - 1;
> +
>   	list = kshark_find_task(kshark_ctx, key, pid);
>   	if (list)
>   		return list;
> diff --git a/kernel-shark/src/libkshark.h b/kernel-shark/src/libkshark.h
> index 04e9cbfc..3407db19 100644
> --- a/kernel-shark/src/libkshark.h
> +++ b/kernel-shark/src/libkshark.h
> @@ -72,7 +72,8 @@ struct kshark_entry {
>   };
>   
>   /** Size of the task's hash table. */
> -#define KS_TASK_HASH_SIZE 256
> +#define KS_TASK_HASH_SHIFT 16
> +#define KS_TASK_HASH_SIZE (1 << KS_TASK_HASH_SHIFT)
>   
>   /** Linked list of tasks. */
>   struct kshark_task_list {
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] kernel-shark: Increase the size of the task hash
  2019-08-29 12:46 ` Yordan Karadzhov (VMware)
@ 2019-08-29 13:18   ` Steven Rostedt
  2019-08-29 15:49   ` Steven Rostedt
  1 sibling, 0 replies; 5+ messages in thread
From: Steven Rostedt @ 2019-08-29 13:18 UTC (permalink / raw)
  To: Yordan Karadzhov (VMware); +Cc: Linux Trace Devel

On Thu, 29 Aug 2019 15:46:24 +0300
"Yordan Karadzhov (VMware)" <y.karadz@gmail.com> wrote:

> On 28.08.19 г. 21:00 ч., Steven Rostedt wrote:
> > From: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > 
> > When loading a data file that contained 100,000s of tasks, using a 256
> > bucket size hash crippled it. By increasing the hash to 2^16 (65536) it
> > solves the issue (still small enough not to waste too much memory).
> > 
> > In the process, I changed the knuth_hash() in libkshark.c to use the 32
> > bit version and just have the key use what it needs:  
> 
> If the size the hash table is 65536 then the multiplicative hashing 
> function has to multiply the key by a prime number which is closer to 
> 65536 / 1.61803398875 (the so called "golden ratio" of the size).
> It makes no sense to multiply by a number that is several orders of 
> magnitude bigger than the size of the hash table.

Does it really matter?

> 
> > 
> >    key = knuth_hash();
> >    key += knuth_hash >> SHIFT;
> >    key &= (1 << SHIFT) - 1;
> > 
> > Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > ---
> > diff --git a/kernel-shark/src/libkshark.c b/kernel-shark/src/libkshark.c
> > index 4201fa02..41572e18 100644
> > --- a/kernel-shark/src/libkshark.c
> > +++ b/kernel-shark/src/libkshark.c
> > @@ -252,19 +252,19 @@ void kshark_free(struct kshark_context *kshark_ctx)
> >   	free(kshark_ctx);
> >   }
> >   
> > -static inline uint8_t knuth_hash(uint32_t val)
> > +static inline uint32_t knuth_hash(uint32_t val)
> >   {
> >   	/*
> > -	 * Small table hashing function adapted from Donald E. Knuth's 32 bit
> > +	 * Hashing function adapted from Donald E. Knuth's 32 bit
> >   	 * multiplicative hash.  See The Art of Computer Programming (TAOCP).
> >   	 * Multiplication by the Prime number, closest to the golden ratio of
> > -	 * 2^8.
> > +	 * 2^32.
> >   	 */
> > -	return UINT8_C(val) * UINT8_C(157);
> > +	return val * UINT32_C(2654435761);  
> 
> So according to my understanding of the idea in the book, this line 
> should look look this:
> 
> 	return UINT16_C(val) * UINT8_16(65537);

The reason I did it the way I did, was to decouple the hashing
algorithm from the size of the hash. If you notice, if I find the size
too big, or too small, I can make a single modification (change
TASK_HASH_SHIFT to any other number less than 32), and the algorithm
still works.

I'm not tied to this solution, and since you will be maintaining this
code, I have no problem implementing it the way you suggest. I just
wanted to state that by doing so, it binds the size of the hash table
with the entire algorithm, and modifications to the number of buckets
in the hash will be more difficult to maintain.

It does not need to be perfect, and in practice, the one I proposed
worked quite well. Remember, perfection is the enemy of good enough ;-)

> 
> and you do not need to do any additional manipulation of the key (PID).
> Also the function has to return uint16_t.

I see no reason it has to return uint16_t.

-- Steve

> 
> Thanks!
> Yordan
> 
> >   }
> >   
> >   static struct kshark_task_list *
> > -kshark_find_task(struct kshark_context *kshark_ctx, uint8_t key, int pid)
> > +kshark_find_task(struct kshark_context *kshark_ctx, uint32_t key, int pid)
> >   {
> >   	struct kshark_task_list *list;
> >   
> > @@ -280,9 +280,12 @@ static struct kshark_task_list *
> >   kshark_add_task(struct kshark_context *kshark_ctx, int pid)
> >   {
> >   	struct kshark_task_list *list;
> > -	uint8_t key;
> > +	uint32_t key;
> >   
> >   	key = knuth_hash(pid);
> > +	key += key >> KS_TASK_HASH_SHIFT;
> > +	key &= (1 << KS_TASK_HASH_SHIFT) - 1;
> > +
> >   	list = kshark_find_task(kshark_ctx, key, pid);
> >   	if (list)
> >   		return list;
> > diff --git a/kernel-shark/src/libkshark.h b/kernel-shark/src/libkshark.h
> > index 04e9cbfc..3407db19 100644
> > --- a/kernel-shark/src/libkshark.h
> > +++ b/kernel-shark/src/libkshark.h
> > @@ -72,7 +72,8 @@ struct kshark_entry {
> >   };
> >   
> >   /** Size of the task's hash table. */
> > -#define KS_TASK_HASH_SIZE 256
> > +#define KS_TASK_HASH_SHIFT 16
> > +#define KS_TASK_HASH_SIZE (1 << KS_TASK_HASH_SHIFT)
> >   
> >   /** Linked list of tasks. */
> >   struct kshark_task_list {
> >   


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] kernel-shark: Increase the size of the task hash
  2019-08-29 12:46 ` Yordan Karadzhov (VMware)
  2019-08-29 13:18   ` Steven Rostedt
@ 2019-08-29 15:49   ` Steven Rostedt
  2019-09-17 15:03     ` Yordan Karadzhov (VMware)
  1 sibling, 1 reply; 5+ messages in thread
From: Steven Rostedt @ 2019-08-29 15:49 UTC (permalink / raw)
  To: Yordan Karadzhov (VMware); +Cc: Linux Trace Devel

[-- Attachment #1: Type: text/plain, Size: 4295 bytes --]

On Thu, 29 Aug 2019 15:46:24 +0300
"Yordan Karadzhov (VMware)" <y.karadz@gmail.com> wrote:

> On 28.08.19 г. 21:00 ч., Steven Rostedt wrote:
> > From: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > 
> > When loading a data file that contained 100,000s of tasks, using a 256
> > bucket size hash crippled it. By increasing the hash to 2^16 (65536) it
> > solves the issue (still small enough not to waste too much memory).
> > 
> > In the process, I changed the knuth_hash() in libkshark.c to use the 32
> > bit version and just have the key use what it needs:  
> 
> If the size the hash table is 65536 then the multiplicative hashing 
> function has to multiply the key by a prime number which is closer to 
> 65536 / 1.61803398875 (the so called "golden ratio" of the size).
> It makes no sense to multiply by a number that is several orders of 
> magnitude bigger than the size of the hash table.
> 
> > 
> >    key = knuth_hash();
> >    key += knuth_hash >> SHIFT;
> >    key &= (1 << SHIFT) - 1;
> > 
> > Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > ---
> > diff --git a/kernel-shark/src/libkshark.c b/kernel-shark/src/libkshark.c
> > index 4201fa02..41572e18 100644
> > --- a/kernel-shark/src/libkshark.c
> > +++ b/kernel-shark/src/libkshark.c
> > @@ -252,19 +252,19 @@ void kshark_free(struct kshark_context *kshark_ctx)
> >   	free(kshark_ctx);
> >   }
> >   
> > -static inline uint8_t knuth_hash(uint32_t val)
> > +static inline uint32_t knuth_hash(uint32_t val)
> >   {
> >   	/*
> > -	 * Small table hashing function adapted from Donald E. Knuth's 32 bit
> > +	 * Hashing function adapted from Donald E. Knuth's 32 bit
> >   	 * multiplicative hash.  See The Art of Computer Programming (TAOCP).
> >   	 * Multiplication by the Prime number, closest to the golden ratio of
> > -	 * 2^8.
> > +	 * 2^32.
> >   	 */
> > -	return UINT8_C(val) * UINT8_C(157);
> > +	return val * UINT32_C(2654435761);  
> 
> So according to my understanding of the idea in the book, this line 
> should look look this:
> 
> 	return UINT16_C(val) * UINT8_16(65537);

I just tested this (see attached patches, to try it yourself, I added
the two libtraceevent fixes too). The results are *exactly* the same!

It's the prime number multiplication that is important.

They both give:

max=15441999 min=435 total=139895258 avg=2134 std2=55927

(since I didn't feel like adding a math library, the std2 is the square
of the standard deviation. To get the real std you need to take the
square root of it).

Note, its only the same if I remove the key += key >> shift part. With
that still in, it becomes a little different:

max=15440573 min=334 total=139895258 avg=2134 std2=19938

It actually does better!

 std 141.201 (sqrt(19938)) vs 236.488 (sqrt(55927))

Thus, are you OK if I keep this patch as is?

-- Steve

> 
> and you do not need to do any additional manipulation of the key (PID).
> Also the function has to return uint16_t.
> 
> Thanks!
> Yordan
> 
> >   }
> >   
> >   static struct kshark_task_list *
> > -kshark_find_task(struct kshark_context *kshark_ctx, uint8_t key, int pid)
> > +kshark_find_task(struct kshark_context *kshark_ctx, uint32_t key, int pid)
> >   {
> >   	struct kshark_task_list *list;
> >   
> > @@ -280,9 +280,12 @@ static struct kshark_task_list *
> >   kshark_add_task(struct kshark_context *kshark_ctx, int pid)
> >   {
> >   	struct kshark_task_list *list;
> > -	uint8_t key;
> > +	uint32_t key;
> >   
> >   	key = knuth_hash(pid);
> > +	key += key >> KS_TASK_HASH_SHIFT;
> > +	key &= (1 << KS_TASK_HASH_SHIFT) - 1;
> > +
> >   	list = kshark_find_task(kshark_ctx, key, pid);
> >   	if (list)
> >   		return list;
> > diff --git a/kernel-shark/src/libkshark.h b/kernel-shark/src/libkshark.h
> > index 04e9cbfc..3407db19 100644
> > --- a/kernel-shark/src/libkshark.h
> > +++ b/kernel-shark/src/libkshark.h
> > @@ -72,7 +72,8 @@ struct kshark_entry {
> >   };
> >   
> >   /** Size of the task's hash table. */
> > -#define KS_TASK_HASH_SIZE 256
> > +#define KS_TASK_HASH_SHIFT 16
> > +#define KS_TASK_HASH_SIZE (1 << KS_TASK_HASH_SHIFT)
> >   
> >   /** Linked list of tasks. */
> >   struct kshark_task_list {
> >   


[-- Attachment #2: hash32.patch --]
[-- Type: text/x-patch, Size: 2911 bytes --]

diff --git a/kernel-shark/src/libkshark.c b/kernel-shark/src/libkshark.c
index 4201fa02..f8f032e0 100644
--- a/kernel-shark/src/libkshark.c
+++ b/kernel-shark/src/libkshark.c
@@ -252,19 +252,19 @@ void kshark_free(struct kshark_context *kshark_ctx)
 	free(kshark_ctx);
 }
 
-static inline uint8_t knuth_hash(uint32_t val)
+static inline uint32_t knuth_hash(uint32_t val)
 {
 	/*
-	 * Small table hashing function adapted from Donald E. Knuth's 32 bit
+	 * Hashing function adapted from Donald E. Knuth's 32 bit
 	 * multiplicative hash.  See The Art of Computer Programming (TAOCP).
 	 * Multiplication by the Prime number, closest to the golden ratio of
-	 * 2^8.
+	 * 2^32.
 	 */
-	return UINT8_C(val) * UINT8_C(157);
+	return val * UINT32_C(2654435761);
 }
 
 static struct kshark_task_list *
-kshark_find_task(struct kshark_context *kshark_ctx, uint8_t key, int pid)
+kshark_find_task(struct kshark_context *kshark_ctx, uint32_t key, int pid)
 {
 	struct kshark_task_list *list;
 
@@ -276,13 +276,20 @@ kshark_find_task(struct kshark_context *kshark_ctx, uint8_t key, int pid)
 	return NULL;
 }
 
+static int hash_list[KS_TASK_HASH_SIZE];
+
 static struct kshark_task_list *
 kshark_add_task(struct kshark_context *kshark_ctx, int pid)
 {
 	struct kshark_task_list *list;
-	uint8_t key;
+	uint32_t key;
 
 	key = knuth_hash(pid);
+//	key += key >> KS_TASK_HASH_SHIFT;
+	key &= (1 << KS_TASK_HASH_SHIFT) - 1;
+
+	hash_list[key]++;
+
 	list = kshark_find_task(kshark_ctx, key, pid);
 	if (list)
 		return list;
@@ -680,6 +687,30 @@ static void free_rec_list(struct rec_list **rec_list, int n_cpus,
 	free(rec_list);
 }
 
+static void print_hash(void)
+{
+	int min = -1, max = -1;
+	int total = 0, avg, std = 0;
+	int i;
+
+	for (i = 0; i < KS_TASK_HASH_SIZE; i++) {
+		if (min < 0 || hash_list[i] < min)
+			min = hash_list[i];
+		if (max < 0 || hash_list[i] > max)
+			max = hash_list[i];
+		total += hash_list[i];
+	}
+	avg = total / KS_TASK_HASH_SIZE;
+	for (i = 0; i < KS_TASK_HASH_SIZE; i++) {
+		int diff = hash_list[i] - avg;
+
+		std += diff * diff;
+	}
+	std /= KS_TASK_HASH_SIZE - 1;
+	printf("max=%d min=%d total=%d avg=%d std2=%d\n",
+	       max, min, total, avg, std);
+}
+
 static ssize_t get_records(struct kshark_context *kshark_ctx,
 			   struct rec_list ***rec_list, enum rec_type type)
 {
@@ -793,6 +824,7 @@ static ssize_t get_records(struct kshark_context *kshark_ctx,
 		total += count;
 	}
 
+	print_hash();
 	*rec_list = cpu_list;
 	return total;
 
diff --git a/kernel-shark/src/libkshark.h b/kernel-shark/src/libkshark.h
index 04e9cbfc..3407db19 100644
--- a/kernel-shark/src/libkshark.h
+++ b/kernel-shark/src/libkshark.h
@@ -72,7 +72,8 @@ struct kshark_entry {
 };
 
 /** Size of the task's hash table. */
-#define KS_TASK_HASH_SIZE 256
+#define KS_TASK_HASH_SHIFT 16
+#define KS_TASK_HASH_SIZE (1 << KS_TASK_HASH_SHIFT)
 
 /** Linked list of tasks. */
 struct kshark_task_list {

[-- Attachment #3: hash16.patch --]
[-- Type: text/x-patch, Size: 2838 bytes --]

diff --git a/kernel-shark/src/libkshark.c b/kernel-shark/src/libkshark.c
index 4201fa02..ba0ab813 100644
--- a/kernel-shark/src/libkshark.c
+++ b/kernel-shark/src/libkshark.c
@@ -252,19 +252,19 @@ void kshark_free(struct kshark_context *kshark_ctx)
 	free(kshark_ctx);
 }
 
-static inline uint8_t knuth_hash(uint32_t val)
+static inline uint16_t knuth_hash(uint32_t val)
 {
 	/*
-	 * Small table hashing function adapted from Donald E. Knuth's 32 bit
+	 * Hashing function adapted from Donald E. Knuth's 32 bit
 	 * multiplicative hash.  See The Art of Computer Programming (TAOCP).
 	 * Multiplication by the Prime number, closest to the golden ratio of
-	 * 2^8.
+	 * 2^32.
 	 */
-	return UINT8_C(val) * UINT8_C(157);
+	return UINT16_C(val) * UINT16_C(65537);
 }
 
 static struct kshark_task_list *
-kshark_find_task(struct kshark_context *kshark_ctx, uint8_t key, int pid)
+kshark_find_task(struct kshark_context *kshark_ctx, uint32_t key, int pid)
 {
 	struct kshark_task_list *list;
 
@@ -276,13 +276,18 @@ kshark_find_task(struct kshark_context *kshark_ctx, uint8_t key, int pid)
 	return NULL;
 }
 
+static int hash_list[KS_TASK_HASH_SIZE];
+
 static struct kshark_task_list *
 kshark_add_task(struct kshark_context *kshark_ctx, int pid)
 {
 	struct kshark_task_list *list;
-	uint8_t key;
+	uint16_t key;
 
 	key = knuth_hash(pid);
+
+	hash_list[key]++;
+
 	list = kshark_find_task(kshark_ctx, key, pid);
 	if (list)
 		return list;
@@ -680,6 +685,30 @@ static void free_rec_list(struct rec_list **rec_list, int n_cpus,
 	free(rec_list);
 }
 
+static void print_hash(void)
+{
+	int min = -1, max = -1;
+	int total = 0, avg, std = 0;
+	int i;
+
+	for (i = 0; i < KS_TASK_HASH_SIZE; i++) {
+		if (min < 0 || hash_list[i] < min)
+			min = hash_list[i];
+		if (max < 0 || hash_list[i] > max)
+			max = hash_list[i];
+		total += hash_list[i];
+	}
+	avg = total / KS_TASK_HASH_SIZE;
+	for (i = 0; i < KS_TASK_HASH_SIZE; i++) {
+		int diff = hash_list[i] - avg;
+
+		std += diff * diff;
+	}
+	std /= KS_TASK_HASH_SIZE - 1;
+	printf("max=%d min=%d total=%d avg=%d std2=%d\n",
+	       max, min, total, avg, std);
+}
+
 static ssize_t get_records(struct kshark_context *kshark_ctx,
 			   struct rec_list ***rec_list, enum rec_type type)
 {
@@ -793,6 +822,7 @@ static ssize_t get_records(struct kshark_context *kshark_ctx,
 		total += count;
 	}
 
+	print_hash();
 	*rec_list = cpu_list;
 	return total;
 
diff --git a/kernel-shark/src/libkshark.h b/kernel-shark/src/libkshark.h
index 04e9cbfc..3407db19 100644
--- a/kernel-shark/src/libkshark.h
+++ b/kernel-shark/src/libkshark.h
@@ -72,7 +72,8 @@ struct kshark_entry {
 };
 
 /** Size of the task's hash table. */
-#define KS_TASK_HASH_SIZE 256
+#define KS_TASK_HASH_SHIFT 16
+#define KS_TASK_HASH_SIZE (1 << KS_TASK_HASH_SHIFT)
 
 /** Linked list of tasks. */
 struct kshark_task_list {

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] kernel-shark: Increase the size of the task hash
  2019-08-29 15:49   ` Steven Rostedt
@ 2019-09-17 15:03     ` Yordan Karadzhov (VMware)
  0 siblings, 0 replies; 5+ messages in thread
From: Yordan Karadzhov (VMware) @ 2019-09-17 15:03 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: Linux Trace Devel



On 29.08.19 г. 16:49 ч., Steven Rostedt wrote:
> On Thu, 29 Aug 2019 15:46:24 +0300
> "Yordan Karadzhov (VMware)" <y.karadz@gmail.com> wrote:
> 
>> On 28.08.19 г. 21:00 ч., Steven Rostedt wrote:
>>> From: Steven Rostedt (VMware) <rostedt@goodmis.org>
>>>
>>> When loading a data file that contained 100,000s of tasks, using a 256
>>> bucket size hash crippled it. By increasing the hash to 2^16 (65536) it
>>> solves the issue (still small enough not to waste too much memory).
>>>
>>> In the process, I changed the knuth_hash() in libkshark.c to use the 32
>>> bit version and just have the key use what it needs:
>>
>> If the size the hash table is 65536 then the multiplicative hashing
>> function has to multiply the key by a prime number which is closer to
>> 65536 / 1.61803398875 (the so called "golden ratio" of the size).
>> It makes no sense to multiply by a number that is several orders of
>> magnitude bigger than the size of the hash table.
>>
>>>
>>>     key = knuth_hash();
>>>     key += knuth_hash >> SHIFT;
>>>     key &= (1 << SHIFT) - 1;
>>>
>>> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
>>> ---
>>> diff --git a/kernel-shark/src/libkshark.c b/kernel-shark/src/libkshark.c
>>> index 4201fa02..41572e18 100644
>>> --- a/kernel-shark/src/libkshark.c
>>> +++ b/kernel-shark/src/libkshark.c
>>> @@ -252,19 +252,19 @@ void kshark_free(struct kshark_context *kshark_ctx)
>>>    	free(kshark_ctx);
>>>    }
>>>    
>>> -static inline uint8_t knuth_hash(uint32_t val)
>>> +static inline uint32_t knuth_hash(uint32_t val)
>>>    {
>>>    	/*
>>> -	 * Small table hashing function adapted from Donald E. Knuth's 32 bit
>>> +	 * Hashing function adapted from Donald E. Knuth's 32 bit
>>>    	 * multiplicative hash.  See The Art of Computer Programming (TAOCP).
>>>    	 * Multiplication by the Prime number, closest to the golden ratio of
>>> -	 * 2^8.
>>> +	 * 2^32.
>>>    	 */
>>> -	return UINT8_C(val) * UINT8_C(157);
>>> +	return val * UINT32_C(2654435761);

 From my tests it seems that the old and the new version of the hashing 
function are performing exactly the same however your version will be 
much easier to maintain, so I like this modification.


>>
>> So according to my understanding of the idea in the book, this line
>> should look look this:
>>
>> 	return UINT16_C(val) * UINT8_16(65537);
> 
> I just tested this (see attached patches, to try it yourself, I added
> the two libtraceevent fixes too). The results are *exactly* the same!
> 
> It's the prime number multiplication that is important.
> 
> They both give:
> 
> max=15441999 min=435 total=139895258 avg=2134 std2=55927
> 
> (since I didn't feel like adding a math library, the std2 is the square
> of the standard deviation. To get the real std you need to take the
> square root of it).
> 
> Note, its only the same if I remove the key += key >> shift part. With
> that still in, it becomes a little different:
> 
> max=15440573 min=334 total=139895258 avg=2134 std2=19938
> 
> It actually does better!
> 
>   std 141.201 (sqrt(19938)) vs 236.488 (sqrt(55927))
> 
> Thus, are you OK if I keep this patch as is?
> 
> -- Steve
> 
>>
>> and you do not need to do any additional manipulation of the key (PID).
>> Also the function has to return uint16_t.
>>
>> Thanks!
>> Yordan
>>
>>>    }
>>>    
>>>    static struct kshark_task_list *
>>> -kshark_find_task(struct kshark_context *kshark_ctx, uint8_t key, int pid)
>>> +kshark_find_task(struct kshark_context *kshark_ctx, uint32_t key, int pid)
>>>    {
>>>    	struct kshark_task_list *list;
>>>    
>>> @@ -280,9 +280,12 @@ static struct kshark_task_list *
>>>    kshark_add_task(struct kshark_context *kshark_ctx, int pid)
>>>    {
>>>    	struct kshark_task_list *list;
>>> -	uint8_t key;
>>> +	uint32_t key;
>>>    
>>>    	key = knuth_hash(pid);
>>> +	key += key >> KS_TASK_HASH_SHIFT;
 From my tests this so called "entropy" line makes the spread of the 
table worst. I would prefer to remove it.

>>> +	key &= (1 << KS_TASK_HASH_SHIFT) - 1;
The masking must stay but maybe it can be moved inside the hashing function.

Thanks a lot!
Yordan



>>> +
>>>    	list = kshark_find_task(kshark_ctx, key, pid);
>>>    	if (list)
>>>    		return list;
>>> diff --git a/kernel-shark/src/libkshark.h b/kernel-shark/src/libkshark.h
>>> index 04e9cbfc..3407db19 100644
>>> --- a/kernel-shark/src/libkshark.h
>>> +++ b/kernel-shark/src/libkshark.h
>>> @@ -72,7 +72,8 @@ struct kshark_entry {
>>>    };
>>>    
>>>    /** Size of the task's hash table. */
>>> -#define KS_TASK_HASH_SIZE 256
>>> +#define KS_TASK_HASH_SHIFT 16
>>> +#define KS_TASK_HASH_SIZE (1 << KS_TASK_HASH_SHIFT)
>>>    
>>>    /** Linked list of tasks. */
>>>    struct kshark_task_list {
>>>    
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-09-17 15:03 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-28 18:00 [PATCH] kernel-shark: Increase the size of the task hash Steven Rostedt
2019-08-29 12:46 ` Yordan Karadzhov (VMware)
2019-08-29 13:18   ` Steven Rostedt
2019-08-29 15:49   ` Steven Rostedt
2019-09-17 15:03     ` Yordan Karadzhov (VMware)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).