All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] http.c: use CURLOPT_RANGE for range requests
@ 2015-11-02 19:36 David Turner
  2015-11-02 20:18 ` Jeff King
  0 siblings, 1 reply; 6+ messages in thread
From: David Turner @ 2015-11-02 19:36 UTC (permalink / raw)
  To: git, peff; +Cc: David Turner

A HTTP server is permitted to return a non-range response to a HTTP
range request (and Apache httpd in fact does this in some cases).
While libcurl knows how to correctly handle this (by skipping bytes
before and after the requested range), it only turns on this handling
if it is aware that a range request is being made.  By manually
setting the range header instead of using CURLOPT_RANGE, we were
hiding the fact that this was a range request from libcurl.  This
could cause corruption.

Signed-off-by: David Turner <dturner@twopensource.com>
---

This version breaks the range option formatting/setting out to a
helper function, as suggested by Junio and Jeff.

In addition, it clears the range option when curl slots are cleared
before reuse, also as suggested

---
 http.c | 39 ++++++++++++++++++---------------------
 http.h |  1 -
 2 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/http.c b/http.c
index 6b89dea..eea5836 100644
--- a/http.c
+++ b/http.c
@@ -30,7 +30,6 @@ static CURL *curl_default;
 #endif
 
 #define PREV_BUF_SIZE 4096
-#define RANGE_HEADER_SIZE 30
 
 char curl_errorstr[CURL_ERROR_SIZE];
 
@@ -692,6 +691,7 @@ struct active_request_slot *get_active_slot(void)
 	curl_easy_setopt(slot->curl, CURLOPT_UPLOAD, 0);
 	curl_easy_setopt(slot->curl, CURLOPT_HTTPGET, 1);
 	curl_easy_setopt(slot->curl, CURLOPT_FAILONERROR, 1);
+	curl_easy_setopt(slot->curl, CURLOPT_RANGE, NULL);
 #ifdef LIBCURL_CAN_HANDLE_AUTH_ANY
 	curl_easy_setopt(slot->curl, CURLOPT_HTTPAUTH, http_auth_methods);
 #endif
@@ -1184,6 +1184,19 @@ static const char *get_accept_language(void)
 	return cached_accept_language;
 }
 
+static void http_opt_request_remainder(CURL *curl, ssize_t lo)
+{
+	char buf[128];
+	int len = 0;
+
+	if (lo >= 0)
+		len += xsnprintf(buf + len, 128 - len, "%"PRIiMAX,
+				 (intmax_t)lo);
+	len += xsnprintf(buf + len, 128 - len, "-");
+
+	curl_easy_setopt(curl, CURLOPT_RANGE, buf);
+}
+
 /* http_request() targets */
 #define HTTP_REQUEST_STRBUF	0
 #define HTTP_REQUEST_FILE	1
@@ -1212,11 +1225,8 @@ static int http_request(const char *url,
 			long posn = ftell(result);
 			curl_easy_setopt(slot->curl, CURLOPT_WRITEFUNCTION,
 					 fwrite);
-			if (posn > 0) {
-				strbuf_addf(&buf, "Range: bytes=%ld-", posn);
-				headers = curl_slist_append(headers, buf.buf);
-				strbuf_reset(&buf);
-			}
+			if (posn > 0)
+				http_opt_request_remainder(slot->curl, posn);
 		} else
 			curl_easy_setopt(slot->curl, CURLOPT_WRITEFUNCTION,
 					 fwrite_buffer);
@@ -1526,10 +1536,6 @@ void release_http_pack_request(struct http_pack_request *preq)
 		fclose(preq->packfile);
 		preq->packfile = NULL;
 	}
-	if (preq->range_header != NULL) {
-		curl_slist_free_all(preq->range_header);
-		preq->range_header = NULL;
-	}
 	preq->slot = NULL;
 	free(preq->url);
 	free(preq);
@@ -1593,7 +1599,6 @@ struct http_pack_request *new_http_pack_request(
 	struct packed_git *target, const char *base_url)
 {
 	long prev_posn = 0;
-	char range[RANGE_HEADER_SIZE];
 	struct strbuf buf = STRBUF_INIT;
 	struct http_pack_request *preq;
 
@@ -1631,10 +1636,7 @@ struct http_pack_request *new_http_pack_request(
 			fprintf(stderr,
 				"Resuming fetch of pack %s at byte %ld\n",
 				sha1_to_hex(target->sha1), prev_posn);
-		xsnprintf(range, sizeof(range), "Range: bytes=%ld-", prev_posn);
-		preq->range_header = curl_slist_append(NULL, range);
-		curl_easy_setopt(preq->slot->curl, CURLOPT_HTTPHEADER,
-			preq->range_header);
+		http_opt_request_remainder(preq->slot->curl, prev_posn);
 	}
 
 	return preq;
@@ -1684,8 +1686,6 @@ struct http_object_request *new_http_object_request(const char *base_url,
 	char prev_buf[PREV_BUF_SIZE];
 	ssize_t prev_read = 0;
 	long prev_posn = 0;
-	char range[RANGE_HEADER_SIZE];
-	struct curl_slist *range_header = NULL;
 	struct http_object_request *freq;
 
 	freq = xcalloc(1, sizeof(*freq));
@@ -1791,10 +1791,7 @@ struct http_object_request *new_http_object_request(const char *base_url,
 			fprintf(stderr,
 				"Resuming fetch of object %s at byte %ld\n",
 				hex, prev_posn);
-		xsnprintf(range, sizeof(range), "Range: bytes=%ld-", prev_posn);
-		range_header = curl_slist_append(range_header, range);
-		curl_easy_setopt(freq->slot->curl,
-				 CURLOPT_HTTPHEADER, range_header);
+		http_opt_request_remainder(freq->slot->curl, prev_posn);
 	}
 
 	return freq;
diff --git a/http.h b/http.h
index 49afe39..4f97b60 100644
--- a/http.h
+++ b/http.h
@@ -190,7 +190,6 @@ struct http_pack_request {
 	struct packed_git **lst;
 	FILE *packfile;
 	char tmpfile[PATH_MAX];
-	struct curl_slist *range_header;
 	struct active_request_slot *slot;
 };
 
-- 
2.4.2.691.g714732c-twtrsrc

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v3] http.c: use CURLOPT_RANGE for range requests
  2015-11-02 19:36 [PATCH v3] http.c: use CURLOPT_RANGE for range requests David Turner
@ 2015-11-02 20:18 ` Jeff King
  2015-11-02 21:50   ` Andreas Schwab
  0 siblings, 1 reply; 6+ messages in thread
From: Jeff King @ 2015-11-02 20:18 UTC (permalink / raw)
  To: David Turner; +Cc: git

On Mon, Nov 02, 2015 at 02:36:26PM -0500, David Turner wrote:

> A HTTP server is permitted to return a non-range response to a HTTP
> range request (and Apache httpd in fact does this in some cases).
> While libcurl knows how to correctly handle this (by skipping bytes
> before and after the requested range), it only turns on this handling
> if it is aware that a range request is being made.  By manually
> setting the range header instead of using CURLOPT_RANGE, we were
> hiding the fact that this was a range request from libcurl.  This
> could cause corruption.
> 
> Signed-off-by: David Turner <dturner@twopensource.com>
> ---
> 
> This version breaks the range option formatting/setting out to a
> helper function, as suggested by Junio and Jeff.
> 
> In addition, it clears the range option when curl slots are cleared
> before reuse, also as suggested

Thanks, this looks much nicer to me.

A few minor comments:

> +static void http_opt_request_remainder(CURL *curl, ssize_t lo)

I notice you used ssize_t here. If we are going to deal with large files, I
would think off_t would make more sense (i.e., to allow >2GB on a 32-bit
system).

But much worse than that, the value we are passing typically comes from
ftell(), which only returns a long. So we're truncated anyway in that
case.

I certainly don't think we are making anything _worse_ here; the problem
is in the existing code. But I don't think ssize_t is making anything
better (it's generally the same size as a long anyway). So I think I'd
prefer one of the following:

  1. Leave it as "long". At least then we are matching ftell(), which is
     clear (and works fine on 64-bit machines).

  2. Use off_t here instead. It doesn't fix the problem, but at least
     fixes our one component, so it's working towards a better solution
     in the long run.

  3. Detect and complain when we overflow the long. Hopefully ftell()
     returns -1 on a 32-bit system when the file is larger than 2GB, so
     this Just Works already, and we don't create a broken output.

  4. Fix all of the callers. I suspect this would involve calling
     fstat(fileno(fh)) to get a real off_t.

Options (3) and (4) are obviously more work, and I don't necessarily
expect you to do them. But I think I'd prefer (2) to what you have now.
Using off_t has an issue with being unsigned, but...

> +{
> +	char buf[128];
> +	int len = 0;
> +
> +	if (lo >= 0)
> +		len += xsnprintf(buf + len, 128 - len, "%"PRIiMAX,
> +				 (intmax_t)lo);
> +	len += xsnprintf(buf + len, 128 - len, "-");

I think we could just drop this "lo >= 0". Now that there is no "hi"
(which I think is fine), there's no reason to call the function at all
if you do not have a limit.

Also, we should prefer "sizeof(buf)" to repeating the "128", as the two
getting out of sync would be disastrous. So altogether something like:


  static void http_opt_request_remainder(CURL *curl, off_t pos)
  {
	char buf[128];
	xsnprintf(buf, sizeof(buf), "%"PRIuMAX", (uintmax_t)pos);
	curl_easy_setopt(curl, CURLOPT_RANGE, buf);
  }

would be enough, I think.

-Peff

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v3] http.c: use CURLOPT_RANGE for range requests
  2015-11-02 20:18 ` Jeff King
@ 2015-11-02 21:50   ` Andreas Schwab
  2015-11-02 22:10     ` Jeff King
  0 siblings, 1 reply; 6+ messages in thread
From: Andreas Schwab @ 2015-11-02 21:50 UTC (permalink / raw)
  To: Jeff King; +Cc: David Turner, git

Jeff King <peff@peff.net> writes:

>   4. Fix all of the callers. I suspect this would involve calling
>      fstat(fileno(fh)) to get a real off_t.

You can also use ftello which returns off_t.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v3] http.c: use CURLOPT_RANGE for range requests
  2015-11-02 21:50   ` Andreas Schwab
@ 2015-11-02 22:10     ` Jeff King
  2015-11-02 22:41       ` David Turner
  0 siblings, 1 reply; 6+ messages in thread
From: Jeff King @ 2015-11-02 22:10 UTC (permalink / raw)
  To: Andreas Schwab; +Cc: David Turner, git

On Mon, Nov 02, 2015 at 10:50:10PM +0100, Andreas Schwab wrote:

> Jeff King <peff@peff.net> writes:
> 
> >   4. Fix all of the callers. I suspect this would involve calling
> >      fstat(fileno(fh)) to get a real off_t.
> 
> You can also use ftello which returns off_t.

Thanks, I forgot about that function. That would probably be the sanest
path forward.

I think it is as simple as this (on top of David's patch):

-- >8 --
Subject: [PATCH] http: use off_t to store partial file size

When we try to resume transfer of a partially-downloaded
object or pack, we fopen() the existing file for append,
then use ftell() to get the current position. We use a
"long", which can hold only 2GB on a 32-bit system, even
though packfiles may be larger than that.

Let's switch to using off_t, which should hold any file size
our system is capable of storing. We need to use ftello() to
get the off_t. This is in POSIX and hopefully available
everywhere; if not, we should be able to wrap it by falling
back to ftell(), which would presumably return "-1" on such
a large file (and we would simply skip resuming in that case).

Signed-off-by: Jeff King <peff@peff.net>
---
 http.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/http.c b/http.c
index 5f36846..42f29ce 100644
--- a/http.c
+++ b/http.c
@@ -1205,7 +1205,7 @@ static int http_request(const char *url,
 		curl_easy_setopt(slot->curl, CURLOPT_FILE, result);
 
 		if (target == HTTP_REQUEST_FILE) {
-			long posn = ftell(result);
+			off_t posn = ftello(result);
 			curl_easy_setopt(slot->curl, CURLOPT_WRITEFUNCTION,
 					 fwrite);
 			if (posn > 0)
@@ -1581,7 +1581,7 @@ int finish_http_pack_request(struct http_pack_request *preq)
 struct http_pack_request *new_http_pack_request(
 	struct packed_git *target, const char *base_url)
 {
-	long prev_posn = 0;
+	off_t prev_posn = 0;
 	struct strbuf buf = STRBUF_INIT;
 	struct http_pack_request *preq;
 
@@ -1613,7 +1613,7 @@ struct http_pack_request *new_http_pack_request(
 	 * If there is data present from a previous transfer attempt,
 	 * resume where it left off
 	 */
-	prev_posn = ftell(preq->packfile);
+	prev_posn = ftello(preq->packfile);
 	if (prev_posn>0) {
 		if (http_is_verbose)
 			fprintf(stderr,
@@ -1668,7 +1668,7 @@ struct http_object_request *new_http_object_request(const char *base_url,
 	int prevlocal;
 	char prev_buf[PREV_BUF_SIZE];
 	ssize_t prev_read = 0;
-	long prev_posn = 0;
+	off_t prev_posn = 0;
 	struct http_object_request *freq;
 
 	freq = xcalloc(1, sizeof(*freq));
-- 
2.6.2.627.g377c1c5

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v3] http.c: use CURLOPT_RANGE for range requests
  2015-11-02 22:10     ` Jeff King
@ 2015-11-02 22:41       ` David Turner
  2015-11-02 22:45         ` Jeff King
  0 siblings, 1 reply; 6+ messages in thread
From: David Turner @ 2015-11-02 22:41 UTC (permalink / raw)
  To: Jeff King; +Cc: Andreas Schwab, git

On Mon, 2015-11-02 at 17:10 -0500, Jeff King wrote:
> On Mon, Nov 02, 2015 at 10:50:10PM +0100, Andreas Schwab wrote:
> 
> > Jeff King <peff@peff.net> writes:
> > 
> > >   4. Fix all of the callers. I suspect this would involve calling
> > >      fstat(fileno(fh)) to get a real off_t.
> > 
> > You can also use ftello which returns off_t.
> 
> Thanks, I forgot about that function. That would probably be the sanest
> path forward.
> 
> I think it is as simple as this (on top of David's patch):
> 
> -- >8 --
> Subject: [PATCH] http: use off_t to store partial file size
> 
> When we try to resume transfer of a partially-downloaded
> object or pack, we fopen() the existing file for append,
> then use ftell() to get the current position. We use a
> "long", which can hold only 2GB on a 32-bit system, even
> though packfiles may be larger than that.
> 
> Let's switch to using off_t, which should hold any file size
> our system is capable of storing. We need to use ftello() to
> get the off_t. This is in POSIX and hopefully available
> everywhere; if not, we should be able to wrap it by falling
> back to ftell(), which would presumably return "-1" on such
> a large file (and we would simply skip resuming in that case).

It would skip resuming, but would still maybe write to the end of the
existing file, right?  So I think we would need to seek to the beginning
of the file in that case.

(other than that, lgtm)

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v3] http.c: use CURLOPT_RANGE for range requests
  2015-11-02 22:41       ` David Turner
@ 2015-11-02 22:45         ` Jeff King
  0 siblings, 0 replies; 6+ messages in thread
From: Jeff King @ 2015-11-02 22:45 UTC (permalink / raw)
  To: David Turner; +Cc: Andreas Schwab, git

On Mon, Nov 02, 2015 at 05:41:24PM -0500, David Turner wrote:

> > Let's switch to using off_t, which should hold any file size
> > our system is capable of storing. We need to use ftello() to
> > get the off_t. This is in POSIX and hopefully available
> > everywhere; if not, we should be able to wrap it by falling
> > back to ftell(), which would presumably return "-1" on such
> > a large file (and we would simply skip resuming in that case).
> 
> It would skip resuming, but would still maybe write to the end of the
> existing file, right?  So I think we would need to seek to the beginning
> of the file in that case.

Oh, you're right. That's no worse than the current behavior, but it's
definitely not ideal. I'd leave that for a follow-on patch that actually
implements such an ftello wrapper.

-Peff

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2015-11-02 22:46 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-11-02 19:36 [PATCH v3] http.c: use CURLOPT_RANGE for range requests David Turner
2015-11-02 20:18 ` Jeff King
2015-11-02 21:50   ` Andreas Schwab
2015-11-02 22:10     ` Jeff King
2015-11-02 22:41       ` David Turner
2015-11-02 22:45         ` Jeff King

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.