git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] MinGW readdir reimplementation to support d_type
@ 2009-04-08 21:01 Marius Storm-Olsen
  2009-04-09 20:34 ` [msysGit] " Johannes Sixt
  0 siblings, 1 reply; 6+ messages in thread
From: Marius Storm-Olsen @ 2009-04-08 21:01 UTC (permalink / raw)
  To: git, msysgit; +Cc: Marius Storm-Olsen

The original readdir implementation was fast, but didn't
support the d_type. This means that git would do additional
lstats for each entry, to figure out if the entry was a
directory or not. This unneedingly slowed down many
operations, since Windows API provides this information
directly when walking the directories.

By running this implementation on Moe's repo structure:
  mkdir bummer && cd bummer; for ((i=0;i<100;i++)); do
    mkdir $i && pushd $i;
      for ((j=0;j<1000;j++)); do echo "$j" >$j; done;
    popd;
  done

We see the following speedups:
  git add .
  -------------------
  old: 00:00:23(.087)
  new: 00:00:21(.512) 1.07x

  git status
  -------------------
  old: 00:00:03(.306)
  new: 00:00:01(.684) 1.96x

  git clean -dxf
  -------------------
  old: 00:00:01(.918)
  new: 00:00:00(.295) 6.50x

Signed-off-by: Marius Storm-Olsen <marius@trolltech.com>
---
 It would be nice if MinGW/Windows people would give this a thorough
 testing to ensure that's it's pristine. It seems fine, and I've not
 stumbled over anything myself.

 Of course, if you have status.showUntrackedFiles = no, then you'll
 not get any speedups, since the read_directory_recursive loop is
 never entered. People with a standard setup, however, should
 experience a significant speedup.

 compat/mingw.c |   59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 compat/mingw.h |   28 ++++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 0 deletions(-)

diff --git a/compat/mingw.c b/compat/mingw.c
index 2839d9d..f52de3e 100644
--- a/compat/mingw.c
+++ b/compat/mingw.c
@@ -1139,3 +1139,62 @@ int link(const char *oldpath, const char *newpath)
 	}
 	return 0;
 }
+
+#ifndef NO_MINGW_REPLACE_READDIR
+/* MinGW readdir implementation to avoid extra lstats for Git */
+struct mingw_DIR
+{
+	struct _finddata_t	dd_dta;		/* disk transfer area for this dir */
+	struct mingw_dirent	dd_dir;		/* Our own implementation, including d_type */
+	long			dd_handle;	/* _findnext handle */
+	int			dd_stat; 	/* 0 = next entry to read is first entry, -1 = off the end, positive = 0 based index of next entry */
+	char			dd_name[1]; 	/* given path for dir with search pattern (struct is extended) */
+};
+
+struct dirent *mingw_readdir(DIR *dir)
+{
+	WIN32_FIND_DATAA buf;
+	HANDLE handle;
+	struct mingw_DIR *mdir = (struct mingw_DIR*)dir;
+
+	if (!dir->dd_handle) {
+		errno = EBADF; /* No set_errno for mingw */
+		return NULL;
+	}
+
+	if (dir->dd_handle == (long)INVALID_HANDLE_VALUE && dir->dd_stat == 0)
+	{
+		handle = FindFirstFileA(dir->dd_name, &buf);
+		DWORD lasterr = GetLastError();
+		dir->dd_handle = (long)handle;
+		if (handle == INVALID_HANDLE_VALUE && (lasterr != ERROR_NO_MORE_FILES)) {
+			errno = err_win_to_posix(lasterr);
+			return NULL;
+		}
+	} else if (dir->dd_handle == (long)INVALID_HANDLE_VALUE) {
+		return NULL;
+	} else if (!FindNextFileA((HANDLE)dir->dd_handle, &buf)) {
+		DWORD lasterr = GetLastError();
+		FindClose((HANDLE)dir->dd_handle);
+		dir->dd_handle = (long)INVALID_HANDLE_VALUE;
+		/* POSIX says you shouldn't set errno when readdir can't
+  		   find any more files; so, if another error we leave it set. */
+		if (lasterr != ERROR_NO_MORE_FILES)
+			errno = err_win_to_posix(lasterr);
+		return NULL;
+	}
+
+	/* We get here if `buf' contains valid data.  */
+	strcpy(dir->dd_dir.d_name, buf.cFileName);
+	++dir->dd_stat;
+
+	/* Set file type, based on WIN32_FIND_DATA */
+	mdir->dd_dir.d_type = 0;
+	if (buf.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
+		mdir->dd_dir.d_type |= DT_DIR;
+	else
+		mdir->dd_dir.d_type |= DT_REG;
+
+	return (struct dirent*)&dir->dd_dir;
+}
+#endif // !NO_MINGW_REPLACE_READDIR
diff --git a/compat/mingw.h b/compat/mingw.h
index 762eb14..104b310 100644
--- a/compat/mingw.h
+++ b/compat/mingw.h
@@ -233,3 +233,31 @@ int main(int argc, const char **argv) \
 	return mingw_main(argc, argv); \
 } \
 static int mingw_main(c,v)
+
+#ifndef NO_MINGW_REPLACE_READDIR
+/*
+ * A replacement of readdir, to ensure that it reads the file type at
+ * the same time. This avoid extra unneeded lstats in git on MinGW
+ */
+#undef DT_UNKNOWN
+#undef DT_DIR
+#undef DT_REG
+#undef DT_LNK
+#define DT_UNKNOWN	0
+#define DT_DIR		1
+#define DT_REG		2
+#define DT_LNK		3
+
+struct mingw_dirent
+{
+	long		d_ino;			/* Always zero. */
+	union {
+		unsigned short	d_reclen;	/* Always zero. */
+		unsigned char   d_type;		/* Reimplementation adds this */
+	};
+	unsigned short	d_namlen;		/* Length of name in d_name. */
+	char		d_name[FILENAME_MAX];	/* File name. */
+};
+#define dirent mingw_dirent
+#define readdir(x) mingw_readdir(x)
+#endif // !NO_MINGW_REPLACE_READDIR
--
1.6.2.2.472.gf61f7.dirty

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [msysGit] [PATCH] MinGW readdir reimplementation to support d_type
  2009-04-08 21:01 [PATCH] MinGW readdir reimplementation to support d_type Marius Storm-Olsen
@ 2009-04-09 20:34 ` Johannes Sixt
  2009-04-10  7:50   ` Marius Storm-Olsen
  0 siblings, 1 reply; 6+ messages in thread
From: Johannes Sixt @ 2009-04-09 20:34 UTC (permalink / raw)
  To: marius; +Cc: git, msysgit

Marius Storm-Olsen schrieb:
> The original readdir implementation was fast, but didn't
> support the d_type. This means that git would do additional
> lstats for each entry, to figure out if the entry was a
> directory or not. This unneedingly slowed down many
> operations, since Windows API provides this information
> directly when walking the directories.
> 
> By running this implementation on Moe's repo structure:
>   mkdir bummer && cd bummer; for ((i=0;i<100;i++)); do
>     mkdir $i && pushd $i;
>       for ((j=0;j<1000;j++)); do echo "$j" >$j; done;
>     popd;
>   done
> 
> We see the following speedups:
>   git add .
>   -------------------
>   old: 00:00:23(.087)
>   new: 00:00:21(.512) 1.07x
> 
>   git status
>   -------------------
>   old: 00:00:03(.306)
>   new: 00:00:01(.684) 1.96x
> 
>   git clean -dxf
>   -------------------
>   old: 00:00:01(.918)
>   new: 00:00:00(.295) 6.50x

Well done!

> +struct mingw_dirent
> +{
> +	long		d_ino;			/* Always zero. */
> +	union {
> +		unsigned short	d_reclen;	/* Always zero. */
> +		unsigned char   d_type;		/* Reimplementation adds this */
> +	};

VERY sneaky! I was wondering why you could get away without replacing 
opendir and closedir, and why you still defined a replacement mingw_DIR 
that contains the replacement mingw_dirent, until I noticed this unnamed 
union.

Since we don't use d_reclen anywhere in the code, wouldn't you get away with

#define d_type d_reclen

unless the type (short vs. char) makes a difference. Or would you say that 
doing that would be even more sneaky?

> +	unsigned short	d_namlen;		/* Length of name in d_name. */
> +	char		d_name[FILENAME_MAX];	/* File name. */
> +};
> +#define dirent mingw_dirent
> +#define readdir(x) mingw_readdir(x)
> +#endif // !NO_MINGW_REPLACE_READDIR

-- Hannes

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] MinGW readdir reimplementation to support d_type
  2009-04-09 20:34 ` [msysGit] " Johannes Sixt
@ 2009-04-10  7:50   ` Marius Storm-Olsen
  2009-04-11 21:44     ` Johannes Sixt
  0 siblings, 1 reply; 6+ messages in thread
From: Marius Storm-Olsen @ 2009-04-10  7:50 UTC (permalink / raw)
  To: j6t; +Cc: marius, git, msysgit


Johannes Sixt said the following on 09.04.2009 22:34:
> Marius Storm-Olsen schrieb:
>> +struct mingw_dirent
>> +{
>> +	long		d_ino;			/* Always zero. */
>> +	union {
>> +		unsigned short	d_reclen;	/* Always zero. */
>> +		unsigned char   d_type;		/* Reimplementation adds this */
>> +	};
> 
> VERY sneaky! I was wondering why you could get away without replacing
> opendir and closedir, and why you still defined a replacement
> mingw_DIR that contains the replacement mingw_dirent, until I noticed
> this unnamed union.
> 
> Since we don't use d_reclen anywhere in the code, wouldn't you get
> away with
> 
> #define d_type d_reclen
> 
> unless the type (short vs. char) makes a difference. Or would you say
> that doing that would be even more sneaky?

I'm sure it could be done just with a define. However, given the 
remaining unused variables, I was wondering about also packing in 
permission bits and file modification time in there, to optimize the 
status checking even further. That way, on Windows, we would only need 
one 'readdir' pass to check the whole repository, with no lstats 
whatsoever. So, this was patch was a 'primer' for that, hence the union 
with a proper uchar for the d_type.

However, that would also mean a significant change in the status 
checking code, as it first lstat's ever file in the index, then uses 
read_directory + lstat's for others. I guess that'll be too big of a 
change in core code, so the vision is moot?

I'd be ok to just use the define, provided that it compiles cleanly of 
course, if the above seems too ambitious. :-) I kinda feel like the 
current code is more clean though :)

--
.marius

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] MinGW readdir reimplementation to support d_type
  2009-04-10  7:50   ` Marius Storm-Olsen
@ 2009-04-11 21:44     ` Johannes Sixt
  2009-05-07 21:26       ` Heiko Voigt
  0 siblings, 1 reply; 6+ messages in thread
From: Johannes Sixt @ 2009-04-11 21:44 UTC (permalink / raw)
  To: Marius Storm-Olsen; +Cc: marius, git, msysgit


Marius Storm-Olsen schrieb:
> Johannes Sixt said the following on 09.04.2009 22:34:
>> Marius Storm-Olsen schrieb:
>>> +struct mingw_dirent
>>> +{
>>> +    long        d_ino;            /* Always zero. */
>>> +    union {
>>> +        unsigned short    d_reclen;    /* Always zero. */
>>> +        unsigned char   d_type;        /* Reimplementation adds this */
>>> +    };
>>
>> VERY sneaky! I was wondering why you could get away without replacing
>> opendir and closedir, and why you still defined a replacement
>> mingw_DIR that contains the replacement mingw_dirent, until I noticed
>> this unnamed union.
>>
>> Since we don't use d_reclen anywhere in the code, wouldn't you get
>> away with
>>
>> #define d_type d_reclen
>>
>> unless the type (short vs. char) makes a difference. Or would you say
>> that doing that would be even more sneaky?
> 
> I'm sure it could be done just with a define. However, given the 
> remaining unused variables, I was wondering about also packing in 
> permission bits and file modification time in there, to optimize the 
> status checking even further. That way, on Windows, we would only need 
> one 'readdir' pass to check the whole repository, with no lstats 
> whatsoever. So, this was patch was a 'primer' for that, hence the union 
> with a proper uchar for the d_type.
> 
> However, that would also mean a significant change in the status 
> checking code, as it first lstat's ever file in the index, then uses 
> read_directory + lstat's for others. I guess that'll be too big of a 
> change in core code, so the vision is moot?
> 
> I'd be ok to just use the define, provided that it compiles cleanly of 
> course, if the above seems too ambitious. :-) I kinda feel like the 
> current code is more clean though :)

With a comment in the commit message, it would have been clear, perhaps.

I'll carry this in my (private) tree for a while with the below squashed 
in to avoid a lot of warnings.

-- Hannes

diff --git a/compat/mingw.h b/compat/mingw.h
index 104b310..16ec76b 100644
--- a/compat/mingw.h
+++ b/compat/mingw.h
@@ -260,4 +260,5 @@ struct mingw_dirent
  };
  #define dirent mingw_dirent
  #define readdir(x) mingw_readdir(x)
+struct dirent *mingw_readdir(DIR *dir);
  #endif // !NO_MINGW_REPLACE_READDIR

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] MinGW readdir reimplementation to support d_type
  2009-04-11 21:44     ` Johannes Sixt
@ 2009-05-07 21:26       ` Heiko Voigt
  2009-05-08  5:45         ` Marius Storm-Olsen
  0 siblings, 1 reply; 6+ messages in thread
From: Heiko Voigt @ 2009-05-07 21:26 UTC (permalink / raw)
  To: Johannes Sixt; +Cc: Marius Storm-Olsen, marius, git, msysgit

On Sat, Apr 11, 2009 at 11:44:58PM +0200, Johannes Sixt wrote:
> With a comment in the commit message, it would have been clear, perhaps.
>
> I'll carry this in my (private) tree for a while with the below squashed  
> in to avoid a lot of warnings.

What happened to this patch? Is there any reason it can not be included?
I can confirm the factor 2 speedup which is very noticeable. Especially
when working with "git gui" which does git status very often.

cheers Heiko

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] MinGW readdir reimplementation to support d_type
  2009-05-07 21:26       ` Heiko Voigt
@ 2009-05-08  5:45         ` Marius Storm-Olsen
  0 siblings, 0 replies; 6+ messages in thread
From: Marius Storm-Olsen @ 2009-05-08  5:45 UTC (permalink / raw)
  To: Heiko Voigt; +Cc: Johannes Sixt, git, msysgit

Heiko Voigt said the following on 07.05.2009 23:26:
> On Sat, Apr 11, 2009 at 11:44:58PM +0200, Johannes Sixt wrote:
>> With a comment in the commit message, it would have been clear,
>> perhaps.
>> 
>> I'll carry this in my (private) tree for a while with the below
>> squashed in to avoid a lot of warnings.
> 
> What happened to this patch? Is there any reason it can not be
> included? I can confirm the factor 2 speedup which is very
> noticeable. Especially when working with "git gui" which does git
> status very often.

Msysgit 1.6.3 (http://code.google.com/p/msysgit/downloads/list) contains 
this patch ontop of baseline 1.6.3. It hasn't made it into the mainline 
yet though. All in good time :-)

--
.marius

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2009-05-08  5:45 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-08 21:01 [PATCH] MinGW readdir reimplementation to support d_type Marius Storm-Olsen
2009-04-09 20:34 ` [msysGit] " Johannes Sixt
2009-04-10  7:50   ` Marius Storm-Olsen
2009-04-11 21:44     ` Johannes Sixt
2009-05-07 21:26       ` Heiko Voigt
2009-05-08  5:45         ` Marius Storm-Olsen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).