All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects
@ 2017-09-22 20:30 Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 06/13] list-objects-filter-sparse: add sparse-checkout based filter Jeff Hostetler
                   ` (7 more replies)
  0 siblings, 8 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Create a filter for traverse_commit_list_worker() to omit blobs
larger than a requested size from the result, but always include
".git*" special files.

Signed-off-by: Jeff Hostetler <jeffhost>@microsoft.com>
---
 Makefile                    |   1 +
 list-objects-filter-large.c | 108 ++++++++++++++++++++++++++++++++++++++++++++
 list-objects-filter-large.h |  18 ++++++++
 3 files changed, 127 insertions(+)
 create mode 100644 list-objects-filter-large.c
 create mode 100644 list-objects-filter-large.h

diff --git a/Makefile b/Makefile
index b98e3dc..f1f3979 100644
--- a/Makefile
+++ b/Makefile
@@ -799,6 +799,7 @@ LIB_OBJS += line-log.o
 LIB_OBJS += line-range.o
 LIB_OBJS += list-objects.o
 LIB_OBJS += list-objects-filter-all.o
+LIB_OBJS += list-objects-filter-large.o
 LIB_OBJS += ll-merge.o
 LIB_OBJS += lockfile.o
 LIB_OBJS += log-tree.o
diff --git a/list-objects-filter-large.c b/list-objects-filter-large.c
new file mode 100644
index 0000000..1af39b6
--- /dev/null
+++ b/list-objects-filter-large.c
@@ -0,0 +1,108 @@
+#include "cache.h"
+#include "dir.h"
+#include "tag.h"
+#include "commit.h"
+#include "tree.h"
+#include "blob.h"
+#include "diff.h"
+#include "tree-walk.h"
+#include "revision.h"
+#include "list-objects.h"
+#include "list-objects-filter-large.h"
+
+/*
+ * A filter for list-objects to omit large blobs,
+ * but always include ".git*" special files.
+ */
+struct filter_omit_large_blobs_data {
+	struct oidset2 omits;
+	int64_t max_bytes;
+};
+
+static list_objects_filter_result filter_omit_large_blobs(
+	list_objects_filter_type filter_type,
+	struct object *obj,
+	const char *pathname,
+	const char *filename,
+	void *filter_data_)
+{
+	struct filter_omit_large_blobs_data *filter_data = filter_data_;
+	int64_t object_length = -1;
+	unsigned long s;
+	enum object_type t;
+
+	switch (filter_type) {
+	default:
+		die("unkown filter_type");
+		return LOFR_ZERO;
+
+	case LOFT_BEGIN_TREE:
+		assert(obj->type == OBJ_TREE);
+		/* always include all tree objects */
+		return LOFR_MARK_SEEN | LOFR_SHOW;
+
+	case LOFT_END_TREE:
+		assert(obj->type == OBJ_TREE);
+		return LOFR_ZERO;
+
+	case LOFT_BLOB:
+		assert(obj->type == OBJ_BLOB);
+		assert((obj->flags & SEEN) == 0);
+
+		/*
+		 * If previously provisionally omitted (because of size), see if the
+		 * current filename is special and force it to be included.
+		 */
+		if (oidset2_contains(&filter_data->omits, &obj->oid)) {
+			if ((strncmp(filename, ".git", 4) == 0) && filename[4]) {
+				oidset2_remove(&filter_data->omits, &obj->oid);
+				return LOFR_MARK_SEEN | LOFR_SHOW;
+			}
+			return LOFR_ZERO; /* continue provisionally omitting it */
+		}
+
+		t = sha1_object_info(obj->oid.hash, &s);
+		assert(t == OBJ_BLOB);
+		object_length = (int64_t)((uint64_t)(s));
+
+		if (object_length < filter_data->max_bytes)
+			return LOFR_MARK_SEEN | LOFR_SHOW;
+
+		/*
+		 * Provisionally omit it.  We've already established that this blob
+		 * is too big and doesn't have a special filename, so we WANT to
+		 * omit it.  However, there may be a special file elsewhere in the
+		 * tree that references this same blob, so we cannot reject it yet.
+		 * Leave the LOFR_ bits unset so that if the blob appears again in
+		 * the traversal, we will be asked again.
+		 *
+		 * No need for a pathname, since we only test for special filenames
+		 * above.
+		 */
+		oidset2_insert(&filter_data->omits, &obj->oid, t, object_length,
+			       NULL);
+		return LOFR_ZERO;
+	}
+}
+
+void traverse_commit_list_omit_large_blobs(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	int64_t large_byte_limit)
+{
+	struct filter_omit_large_blobs_data d;
+
+	memset(&d, 0, sizeof(d));
+	d.max_bytes = large_byte_limit;
+
+	traverse_commit_list_worker(revs, show_commit, show_object, ctx_data,
+				    filter_omit_large_blobs, &d);
+
+	if (print_omitted_object)
+		oidset2_foreach(&d.omits, print_omitted_object, ctx_data);
+
+	oidset2_clear(&d.omits);
+}
diff --git a/list-objects-filter-large.h b/list-objects-filter-large.h
new file mode 100644
index 0000000..4a5c772
--- /dev/null
+++ b/list-objects-filter-large.h
@@ -0,0 +1,18 @@
+#ifndef LIST_OBJECTS_FILTER_LARGE_H
+#define LIST_OBJECTS_FILTER_LARGE_H
+
+#include "oidset2.h"
+
+/*
+ * A filter for list-objects to omit large blobs,
+ * but always include ".git*" special files.
+ */
+void traverse_commit_list_omit_large_blobs(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	int64_t large_byte_limit);
+
+#endif /* LIST_OBJECTS_FILTER_LARGE_H */
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 06/13] list-objects-filter-sparse: add sparse-checkout based filter
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 07/13] object-filter: common declarations for object filtering Jeff Hostetler
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Create a filter for traverse_commit_list_worker() to only include
the blobs the would be referenced by a sparse-checkout using the
given specification.

A future enhancement should be able to also omit unneeded tree
objects, but that is not currently supported.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 Makefile                     |   1 +
 list-objects-filter-sparse.c | 221 +++++++++++++++++++++++++++++++++++++++++++
 list-objects-filter-sparse.h |  30 ++++++
 3 files changed, 252 insertions(+)
 create mode 100644 list-objects-filter-sparse.c
 create mode 100644 list-objects-filter-sparse.h

diff --git a/Makefile b/Makefile
index f1f3979..6e0bd91 100644
--- a/Makefile
+++ b/Makefile
@@ -800,6 +800,7 @@ LIB_OBJS += line-range.o
 LIB_OBJS += list-objects.o
 LIB_OBJS += list-objects-filter-all.o
 LIB_OBJS += list-objects-filter-large.o
+LIB_OBJS += list-objects-filter-sparse.o
 LIB_OBJS += ll-merge.o
 LIB_OBJS += lockfile.o
 LIB_OBJS += log-tree.o
diff --git a/list-objects-filter-sparse.c b/list-objects-filter-sparse.c
new file mode 100644
index 0000000..59155d5
--- /dev/null
+++ b/list-objects-filter-sparse.c
@@ -0,0 +1,221 @@
+#include "cache.h"
+#include "dir.h"
+#include "tag.h"
+#include "commit.h"
+#include "tree.h"
+#include "blob.h"
+#include "diff.h"
+#include "tree-walk.h"
+#include "revision.h"
+#include "list-objects.h"
+#include "list-objects-filter-sparse.h"
+
+/*
+ * A filter driven by a sparse-checkout specification to only
+ * include blobs that a sparse checkout would populate.
+ *
+ * The sparse-checkout spec can be loaded from a blob with the
+ * given OID or from a local pathname.  We allow an OID because
+ * the repo may be bare or we may be doing the filtering on the
+ * server.
+ */
+struct frame {
+	int defval;
+	int child_prov_omit : 1;
+};
+
+struct filter_use_sparse_data {
+	struct oidset2 omits;
+	struct exclude_list el;
+
+	size_t nr, alloc;
+	struct frame *array_frame;
+};
+
+static list_objects_filter_result filter_use_sparse(
+	list_objects_filter_type filter_type,
+	struct object *obj,
+	const char *pathname,
+	const char *filename,
+	void *filter_data_)
+{
+	struct filter_use_sparse_data *filter_data = filter_data_;
+	int64_t object_length = -1;
+	int val, dtype;
+	unsigned long s;
+	enum object_type t;
+	struct frame *frame;
+
+	switch (filter_type) {
+	default:
+		die("unkown filter_type");
+		return LOFR_ZERO;
+
+	case LOFT_BEGIN_TREE:
+		assert(obj->type == OBJ_TREE);
+		dtype = DT_DIR;
+		val = is_excluded_from_list(pathname, strlen(pathname),
+					    filename, &dtype, &filter_data->el,
+					    &the_index);
+		if (val < 0)
+			val = filter_data->array_frame[filter_data->nr].defval;
+
+		ALLOC_GROW(filter_data->array_frame, filter_data->nr + 1,
+			   filter_data->alloc);
+		filter_data->nr++;
+		filter_data->array_frame[filter_data->nr].defval = val;
+		filter_data->array_frame[filter_data->nr].child_prov_omit = 0;
+
+		/*
+		 * A directory with this tree OID may appear in multiple
+		 * places in the tree. (Think of a directory move, with
+		 * no other changes.)  And with a different pathname, the
+		 * is_excluded...() results for this directory and items
+		 * contained within it may be different.  So we cannot
+		 * mark it SEEN (yet), since that will prevent process_tree()
+		 * from revisiting this tree object with other pathnames.
+		 *
+		 * Only SHOW the tree object the first time we visit this
+		 * tree object.
+		 *
+		 * We always show all tree objects.  A future optimization
+		 * may want to attempt to narrow this.
+		 */
+		if (obj->flags & FILTER_REVISIT)
+			return LOFR_ZERO;
+		obj->flags |= FILTER_REVISIT;
+		return LOFR_SHOW;
+
+	case LOFT_END_TREE:
+		assert(obj->type == OBJ_TREE);
+		assert(filter_data->nr > 0);
+
+		frame = &filter_data->array_frame[filter_data->nr];
+		filter_data->nr--;
+
+		/*
+		 * Tell our parent directory if any of our children were
+		 * provisionally omitted.
+		 */
+		filter_data->array_frame[filter_data->nr].child_prov_omit |=
+			frame->child_prov_omit;
+
+		/*
+		 * If there are NO provisionally omitted child objects (ALL child
+		 * objects in this folder were INCLUDED), then we can mark the
+		 * folder as SEEN (so we will not have to revisit it again).
+		 */
+		if (!frame->child_prov_omit)
+			return LOFR_MARK_SEEN;
+		return LOFR_ZERO;
+
+	case LOFT_BLOB:
+		assert(obj->type == OBJ_BLOB);
+		assert((obj->flags & SEEN) == 0);
+
+		frame = &filter_data->array_frame[filter_data->nr];
+
+		/*
+		 * If we previously provisionally omitted this blob because
+		 * its pathname was not in the sparse-checkout AND this
+		 * reference to the blob has the same pathname, we can avoid
+		 * repeating the exclusion logic on this pathname and just
+		 * continue to provisionally omit it.
+		 */
+		if (obj->flags & FILTER_REVISIT) {
+			struct oidset2_entry *entry_prev;
+			entry_prev = oidset2_get(&filter_data->omits, &obj->oid);
+			if (entry_prev && !strcmp(pathname, entry_prev->pathname)) {
+				frame->child_prov_omit = 1;
+				return LOFR_ZERO;
+			}
+		}
+
+		dtype = DT_REG;
+		val = is_excluded_from_list(pathname, strlen(pathname),
+					    filename, &dtype, &filter_data->el,
+					    &the_index);
+		if (val < 0)
+			val = frame->defval;
+		if (val > 0)
+			return LOFR_MARK_SEEN | LOFR_SHOW;
+
+		t = sha1_object_info(obj->oid.hash, &s);
+		assert(t == OBJ_BLOB);
+		object_length = (int64_t)((uint64_t)(s));
+
+		/*
+		 * Provisionally omit it.  We've already established that
+		 * this pathname is not in the sparse-checkout specification,
+		 * so we WANT to omit this blob.  However, a pathname elsewhere
+		 * in the tree may also reference this same blob, so we cannot
+		 * reject it yet.  Leave the LOFR_ bits unset so that if the
+		 * blob appears again in the traversal, we will be asked again.
+		 *
+		 * The pathname we associate with this omit is just the first
+		 * one we saw for this blob.  Other instances of this blob may
+		 * have other pathnames and that is fine.  We just use it for
+		 * perf because most of the time, the blob will be in the same
+		 * place as we walk the commits.
+		 */
+		oidset2_insert(&filter_data->omits, &obj->oid, t, object_length,
+			       pathname);
+		obj->flags |= FILTER_REVISIT;
+		frame->child_prov_omit = 1;
+		return LOFR_ZERO;
+	}
+}
+
+void traverse_commit_list_use_blob(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	struct object_id *oid)
+{
+	struct filter_use_sparse_data d;
+
+	memset(&d, 0, sizeof(d));
+	if (add_excludes_from_blob_to_list(oid, NULL, 0, &d.el) < 0)
+		die("could not load filter specification");
+
+	ALLOC_GROW(d.array_frame, d.nr + 1, d.alloc);
+	d.array_frame[d.nr].defval = 0; /* default to include */
+	d.array_frame[d.nr].child_prov_omit = 0;
+
+	traverse_commit_list_worker(revs, show_commit, show_object, ctx_data,
+				    filter_use_sparse, &d);
+
+	if (print_omitted_object)
+		oidset2_foreach(&d.omits, print_omitted_object, ctx_data);
+
+	oidset2_clear(&d.omits);
+}
+
+void traverse_commit_list_use_path(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	const char *path)
+{
+	struct filter_use_sparse_data d;
+
+	memset(&d, 0, sizeof(d));
+	if (add_excludes_from_file_to_list(path, NULL, 0, &d.el, NULL) < 0)
+		die("could not load filter specification");
+
+	ALLOC_GROW(d.array_frame, d.nr + 1, d.alloc);
+	d.array_frame[d.nr].defval = 0; /* default to include */
+	d.array_frame[d.nr].child_prov_omit = 0;
+
+	traverse_commit_list_worker(revs, show_commit, show_object, ctx_data,
+				    filter_use_sparse, &d);
+
+	if (print_omitted_object)
+		oidset2_foreach(&d.omits, print_omitted_object, ctx_data);
+
+	oidset2_clear(&d.omits);
+}
diff --git a/list-objects-filter-sparse.h b/list-objects-filter-sparse.h
new file mode 100644
index 0000000..aa89390
--- /dev/null
+++ b/list-objects-filter-sparse.h
@@ -0,0 +1,30 @@
+#ifndef LIST_OBJECTS_FILTERS_SPARSE_H
+#define LIST_OBJECTS_FILTERS_SPARSE_H
+
+#include "oidset2.h"
+
+/*
+ * A filter driven by a sparse-checkout specification to only
+ * include blobs that a sparse checkout would populate.
+ *
+ * The sparse-checkout spec can be loaded from a blob with the
+ * given OID, a blob with a blob-ish path, or from a local pathname.
+ * We allow an OID because the repo may be bare or we may be doing
+ * the filtering on the server.
+ */
+void traverse_commit_list_use_blob(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	struct object_id *oid);
+void traverse_commit_list_use_path(
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *ctx_data,
+	const char *path);
+
+#endif /* LIST_OBJECTS_FILTERS_SPARSE_H */
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 07/13] object-filter: common declarations for object filtering
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 06/13] list-objects-filter-sparse: add sparse-checkout based filter Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  2017-09-26 22:39   ` Jonathan Tan
  2017-09-22 20:30 ` [PATCH 08/13] list-objects: add traverse_commit_list_filtered method Jeff Hostetler
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Create common routines and defines for parsing
object-filter-related command line arguments and
pack-protocol fields.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 Makefile        |   1 +
 object-filter.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 object-filter.h | 173 ++++++++++++++++++++++++++++++++++++
 3 files changed, 443 insertions(+)
 create mode 100644 object-filter.c
 create mode 100644 object-filter.h

diff --git a/Makefile b/Makefile
index 6e0bd91..bddd6b5 100644
--- a/Makefile
+++ b/Makefile
@@ -818,6 +818,7 @@ LIB_OBJS += notes-cache.o
 LIB_OBJS += notes-merge.o
 LIB_OBJS += notes-utils.o
 LIB_OBJS += object.o
+LIB_OBJS += object-filter.o
 LIB_OBJS += oidset.o
 LIB_OBJS += oidset2.o
 LIB_OBJS += pack-bitmap.o
diff --git a/object-filter.c b/object-filter.c
new file mode 100644
index 0000000..c88f79f
--- /dev/null
+++ b/object-filter.c
@@ -0,0 +1,269 @@
+#include "cache.h"
+#include "commit.h"
+#include "config.h"
+#include "revision.h"
+#include "list-objects.h"
+#include "oidset2.h"
+#include "list-objects-filter-all.h"
+#include "list-objects-filter-large.h"
+#include "list-objects-filter-sparse.h"
+#include "object-filter.h"
+
+int parse_filter_omit_all_blobs(struct object_filter_options *filter_options)
+{
+	if (object_filter_enabled(filter_options))
+		die(_("multiple object filter types cannot be combined"));
+
+	filter_options->omit_all_blobs = 1;
+	return 0;
+}
+
+int parse_filter_omit_large_blobs(struct object_filter_options *filter_options,
+				  const char *arg)
+{
+	if (object_filter_enabled(filter_options))
+		die(_("multiple object filter types cannot be combined"));
+
+	filter_options->omit_large_blobs = 1;
+
+	/* we allow "<digits>[kmg]" */
+	if (!git_parse_ulong(arg, &filter_options->large_byte_limit))
+		die(_("invalid size limit for large object filter"));
+
+	filter_options->large_byte_limit_string = strdup(arg);
+	return 0;
+}
+
+int parse_filter_use_blob(struct object_filter_options *filter_options,
+			  const char *arg)
+{
+	struct object_context oc;
+
+	if (object_filter_enabled(filter_options))
+		die(_("multiple object filter types cannot be combined"));
+
+	filter_options->use_blob = 1;
+
+	/*
+	 * The command line argument needs to resolve to an known OID
+	 * representing the content of the desired sparse-checkout file.
+	 *
+	 * We allow various syntax forms for the convenience of the user.
+	 * See sha1_name.c:get_sha1_with_context_1().
+	 *
+	 * Try to evaluate the arg locally in case they use one of the
+	 * convenience patterns.  This must resolve to a blob.
+	 */
+	if (get_sha1_with_context(arg, GET_SHA1_BLOB,
+				  filter_options->sparse_oid.hash, &oc)) {
+		/*
+		 * If that fails, keep the original string in case a client
+		 * command wants to send it to the server.  This allows the
+		 * client to name an OID for a blob they don't have.
+		 */
+		filter_options->sparse_value = strdup(arg);
+		oidcpy(&filter_options->sparse_oid, &null_oid);
+	} else {
+		/*
+		 * Round-trip the found OID to normalize it.
+		 */
+		filter_options->sparse_value =
+			strdup(oid_to_hex(&filter_options->sparse_oid));
+	}
+	return 0;
+}
+
+int parse_filter_use_path(struct object_filter_options *filter_options,
+			  const char *arg)
+{
+	if (object_filter_enabled(filter_options))
+		die(_("multiple object filter types cannot be combined"));
+
+	filter_options->use_path = 1;
+
+	/*
+	 * The command line argument needs to resolve to a local file
+	 * containing the content of the desired sparse-checkout file.
+	 */
+	filter_options->sparse_value = strdup(arg);
+	return 0;
+}
+
+int parse_filter_print_omitted(struct object_filter_options *filter_options)
+{
+	filter_options->print_omitted = 1;
+	return 0;
+}
+
+int parse_filter_print_missing(struct object_filter_options *filter_options)
+{
+	filter_options->print_missing = 1;
+	return 0;
+}
+
+int parse_filter_relax(struct object_filter_options *filter_options)
+{
+	filter_options->relax = 1;
+	return 0;
+}
+
+int opt_parse_filter_omit_all_blobs(const struct option *opt,
+				    const char *arg, int unset)
+{
+	struct object_filter_options *filter_options = opt->value;
+
+	assert(!arg);
+	assert(!unset);
+
+	return parse_filter_omit_all_blobs(filter_options);
+}
+
+int opt_parse_filter_omit_large_blobs(const struct option *opt,
+				      const char *arg, int unset)
+{
+	struct object_filter_options *filter_options = opt->value;
+
+	assert(arg);
+	assert(!unset);
+
+	return parse_filter_omit_large_blobs(filter_options, arg);
+}
+
+int opt_parse_filter_use_blob(const struct option *opt,
+			      const char *arg, int unset)
+{
+	struct object_filter_options *filter_options = opt->value;
+
+	assert(arg);
+	assert(!unset);
+
+	return parse_filter_use_blob(filter_options, arg);
+}
+
+int opt_parse_filter_use_path(const struct option *opt,
+			      const char *arg, int unset)
+{
+	struct object_filter_options *filter_options = opt->value;
+
+	assert(arg);
+	assert(!unset);
+
+	return parse_filter_use_path(filter_options, arg);
+}
+
+int opt_parse_filter_print_omitted(const struct option *opt,
+				   const char *arg, int unset)
+{
+	struct object_filter_options *filter_options = opt->value;
+
+	assert(!arg);
+	assert(!unset);
+
+	return parse_filter_print_omitted(filter_options);
+}
+
+int opt_parse_filter_print_missing(const struct option *opt,
+				   const char *arg, int unset)
+{
+	struct object_filter_options *filter_options = opt->value;
+
+	assert(!arg);
+	assert(!unset);
+
+	return parse_filter_print_missing(filter_options);
+}
+
+int opt_parse_filter_relax(const struct option *opt,
+			   const char *arg, int unset)
+{
+	struct object_filter_options *filter_options = opt->value;
+
+	assert(!arg);
+	assert(!unset);
+
+	return parse_filter_relax(filter_options);
+}
+
+int object_filter_hand_parse_arg(struct object_filter_options *filter_options,
+				 const char *arg,
+				 int allow_print_omitted,
+				 int allow_print_missing,
+				 int allow_relax)
+{
+	if (!strcmp(arg, ("--"CL_ARG_FILTER_OMIT_ALL_BLOBS))) {
+		parse_filter_omit_all_blobs(filter_options);
+		return 1;
+	}
+	if (skip_prefix(arg, ("--"CL_ARG_FILTER_OMIT_LARGE_BLOBS"="), &arg)) {
+		parse_filter_omit_large_blobs(filter_options, arg);
+		return 1;
+	}
+	if (skip_prefix(arg, ("--"CL_ARG_FILTER_USE_BLOB"="), &arg)) {
+		parse_filter_use_blob(filter_options, arg);
+		return 1;
+	}
+	if (skip_prefix(arg, ("--"CL_ARG_FILTER_USE_PATH"="), &arg)) {
+		parse_filter_use_path(filter_options, arg);
+		return 1;
+	}
+
+	if (allow_print_omitted &&
+	    !strcmp(arg, ("--"CL_ARG_FILTER_PRINT_OMITTED))) {
+		parse_filter_print_omitted(filter_options);
+		return 1;
+	}
+
+	if (allow_print_missing &&
+	    !strcmp(arg, ("--"CL_ARG_FILTER_PRINT_MISSING))) {
+		parse_filter_print_missing(filter_options);
+		return 1;
+	}
+
+	if (allow_relax && !strcmp(arg, ("--"CL_ARG_FILTER_RELAX))) {
+		parse_filter_relax(filter_options);
+		return 1;
+	}
+
+	return 0;
+}
+
+int object_filter_hand_parse_protocol(struct object_filter_options *filter_options,
+				      const char *arg,
+				      int allow_print_omitted,
+				      int allow_print_missing,
+				      int allow_relax)
+{
+	if (!strcmp(arg, CL_ARG_FILTER_OMIT_ALL_BLOBS)) {
+		parse_filter_omit_all_blobs(filter_options);
+		return 1;
+	}
+	if (skip_prefix(arg, (CL_ARG_FILTER_OMIT_LARGE_BLOBS" "), &arg)) {
+		parse_filter_omit_large_blobs(filter_options, arg);
+		return 1;
+	}
+	if (skip_prefix(arg, (CL_ARG_FILTER_USE_BLOB" "), &arg)) {
+		parse_filter_use_blob(filter_options, arg);
+		return 1;
+	}
+	if (skip_prefix(arg, (CL_ARG_FILTER_USE_PATH" "), &arg)) {
+		parse_filter_use_path(filter_options, arg);
+		return 1;
+	}
+
+	if (allow_print_omitted &&
+	    !strcmp(arg, CL_ARG_FILTER_PRINT_OMITTED)) {
+		parse_filter_print_omitted(filter_options);
+		return 1;
+	}
+	if (allow_print_missing &&
+	    !strcmp(arg, CL_ARG_FILTER_PRINT_MISSING)) {
+		parse_filter_print_missing(filter_options);
+		return 1;
+	}
+	if (allow_relax && !strcmp(arg, CL_ARG_FILTER_RELAX)) {
+		parse_filter_relax(filter_options);
+		return 1;
+	}
+
+	return 0;
+}
diff --git a/object-filter.h b/object-filter.h
new file mode 100644
index 0000000..13a638b
--- /dev/null
+++ b/object-filter.h
@@ -0,0 +1,173 @@
+#ifndef OBJECT_FILTER_H
+#define OBJECT_FILTER_H
+
+#include "parse-options.h"
+
+/*
+ * Common declarations and utilities for filtering objects (such as omitting
+ * large blobs) in list_objects:traverse_commit_list() and git-rev-list.
+ */
+
+struct object_filter_options {
+	/*
+	 * File pathname or blob-ish path/OID (that get_sha1_with_context() can
+	 * use to find the blob containing the sparse-checkout specification.
+	 * This is only used when use_blob or use_path is set.
+	 */
+	const char *sparse_value;
+	struct object_id sparse_oid;
+
+	/*
+	 * Blob size byte limit for filtering.  Only blobs smaller than this
+	 * value will be included.  A value of zero, omits all blobs.
+	 * only used when omit_large_blobs is set.  Integer and string versions
+	 * of this are kept for convenience.  The string version may contain
+	 * a [kmg] suffix.
+	 */
+	unsigned long large_byte_limit;
+	const char *large_byte_limit_string;
+
+	/* Valid filter types (only one may be used at a time) */
+	unsigned omit_all_blobs : 1;
+	unsigned omit_large_blobs : 1;
+	unsigned use_blob : 1;
+	unsigned use_path : 1;
+
+	/*
+	 * True if rev-list should print a list of the objects omitted
+	 * by this invocation of a filter.
+	 */
+	unsigned print_omitted : 1;
+
+	/*
+	 * True if rev-list should print a list of missing objects.
+	 * Objects can be missing because of a previously filtered
+	 * clone or fetch. The set reported here can also be filtered
+	 * by the current filter in effect.
+	 */
+	unsigned print_missing : 1;
+
+	/* True to suppress missing object errors during consistency checks */
+	unsigned relax : 1;
+};
+
+/*
+ * Return true if a filter is enabled.
+ */
+inline int object_filter_enabled(const struct object_filter_options *p)
+{
+	return p->omit_all_blobs ||
+		p->omit_large_blobs ||
+		p->use_blob ||
+		p->use_path;
+}
+
+/* Normalized command line arguments */
+#define CL_ARG_FILTER_OMIT_ALL_BLOBS     "filter-omit-all-blobs"
+#define CL_ARG_FILTER_OMIT_LARGE_BLOBS   "filter-omit-large-blobs"
+#define CL_ARG_FILTER_USE_BLOB           "filter-use-blob"
+#define CL_ARG_FILTER_USE_PATH           "filter-use-path"
+#define CL_ARG_FILTER_PRINT_OMITTED      "filter-print-omitted"
+#define CL_ARG_FILTER_PRINT_MISSING      "filter-print-missing"
+#define CL_ARG_FILTER_RELAX              "filter-relax"
+
+/*
+ * Common command line argument parsing for object-filter-related
+ * arguments (whether from a hand-parsed or parse-options style
+ * parser.
+ */
+int parse_filter_omit_all_blobs(struct object_filter_options *filter_options);
+int parse_filter_omit_large_blobs(struct object_filter_options *filter_options,
+				  const char *arg);
+int parse_filter_use_blob(struct object_filter_options *filter_options,
+			  const char *arg);
+int parse_filter_use_path(struct object_filter_options *filter_options,
+			  const char *arg);
+int parse_filter_print_omitted(struct object_filter_options *filter_options);
+int parse_filter_print_missing(struct object_filter_options *filter_options);
+int parse_filter_relax(struct object_filter_options *filter_options);
+
+/*
+ * Common command line argument parsers for object-filter-related
+ * arguments comming from parse-options style parsers.
+ */
+
+int opt_parse_filter_omit_all_blobs(const struct option *opt,
+				    const char *arg, int unset);
+int opt_parse_filter_omit_large_blobs(const struct option *opt,
+				      const char *arg, int unset);
+int opt_parse_filter_use_blob(const struct option *opt,
+			      const char *arg, int unset);
+int opt_parse_filter_use_path(const struct option *opt,
+			      const char *arg, int unset);
+int opt_parse_filter_print_omitted(const struct option *opt,
+				   const char *arg, int unset);
+int opt_parse_filter_print_missing(const struct option *opt,
+				   const char *arg, int unset);
+int opt_parse_filter_relax(const struct option *opt,
+			   const char *arg, int unset);
+
+#define OPT_PARSE_FILTER_OMIT_ALL_BLOBS(fo) \
+	{ OPTION_CALLBACK, 0, CL_ARG_FILTER_OMIT_ALL_BLOBS, fo, NULL, \
+	  N_("omit all blobs from result"), PARSE_OPT_NOARG | PARSE_OPT_NONEG, \
+	  opt_parse_filter_omit_all_blobs }
+
+#define OPT_PARSE_FILTER_OMIT_LARGE_BLOBS(fo) \
+	{ OPTION_CALLBACK, 0, CL_ARG_FILTER_OMIT_LARGE_BLOBS, fo, N_("size"), \
+	  N_("omit large blobs from result"), PARSE_OPT_NONEG, \
+	  opt_parse_filter_omit_large_blobs }
+
+#define OPT_PARSE_FILTER_USE_BLOB(fo) \
+	{ OPTION_CALLBACK, 0, CL_ARG_FILTER_USE_BLOB, fo, N_("object"), \
+	  N_("filter results using sparse-checkout specification"), PARSE_OPT_NONEG, \
+	  opt_parse_filter_use_blob }
+
+#define OPT_PARSE_FILTER_USE_PATH(fo) \
+	{ OPTION_CALLBACK, 0, CL_ARG_FILTER_USE_PATH, fo, N_("path"), \
+	  N_("filter results using sparse-checkout specification"), PARSE_OPT_NONEG, \
+	  opt_parse_filter_use_path }
+
+#define OPT_PARSE_FILTER_PRINT_OMITTED(fo) \
+	{ OPTION_CALLBACK, 0, CL_ARG_FILTER_PRINT_OMITTED, fo, NULL,	\
+	  N_("print list of omitted objects"), PARSE_OPT_NOARG | PARSE_OPT_NONEG, \
+	  opt_parse_filter_print_omitted }
+
+#define OPT_PARSE_FILTER_PRINT_MISSING(fo) \
+	{ OPTION_CALLBACK, 0, CL_ARG_FILTER_PRINT_MISSING, fo, NULL,	\
+	  N_("print list of missing objects"), PARSE_OPT_NOARG | PARSE_OPT_NONEG, \
+	  opt_parse_filter_print_missing }
+
+#define OPT_PARSE_FILTER_RELAX(fo) \
+	{ OPTION_CALLBACK, 0, CL_ARG_FILTER_RELAX, fo, NULL, \
+	  N_("relax consistency checks for previously omitted objects"), \
+	  PARSE_OPT_NOARG | PARSE_OPT_NONEG, opt_parse_filter_relax }
+
+/*
+ * Hand parse known object-filter command line options.
+ * Use this when the caller DOES NOT use the normal OPT_
+ * routines.
+ *
+ * Here we assume args of the form "--<key>" or "--<key>=<value>".
+ * Note the literal dash-dash and equals.
+ *
+ * Returns 1 if we handled the argument.
+ */
+int object_filter_hand_parse_arg(struct object_filter_options *filter_options,
+				 const char *arg,
+				 int allow_print_omitted,
+				 int allow_print_missing,
+				 int allow_relax);
+
+/*
+ * Hand parse known object-filter protocol lines.
+ *
+ * Here we assume args of the form "<key>" or "<key> <value>".
+ * Note the literal space before between the key and value.
+ */ 
+int object_filter_hand_parse_protocol(struct object_filter_options *filter_options,
+				      const char *arg,
+				      int allow_print_omitted,
+				      int allow_print_missing,
+				      int allow_relax);
+
+#endif /* OBJECT_FILTER_H */
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 08/13] list-objects: add traverse_commit_list_filtered method
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 06/13] list-objects-filter-sparse: add sparse-checkout based filter Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 07/13] object-filter: common declarations for object filtering Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 09/13] rev-list: add object filtering support Jeff Hostetler
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Add traverse_commit_list_filtered() wrapper around the various
filter methods using common data in object_filter_options.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 list-objects.c | 34 ++++++++++++++++++++++++++++++++++
 list-objects.h | 11 +++++++++++
 2 files changed, 45 insertions(+)

diff --git a/list-objects.c b/list-objects.c
index 3e86008..0f063d9 100644
--- a/list-objects.c
+++ b/list-objects.c
@@ -7,6 +7,9 @@
 #include "tree-walk.h"
 #include "revision.h"
 #include "list-objects.h"
+#include "list-objects-filter-all.h"
+#include "list-objects-filter-large.h"
+#include "list-objects-filter-sparse.h"
 
 static void process_blob(struct rev_info *revs,
 			 struct blob *blob,
@@ -266,3 +269,34 @@ void traverse_commit_list(struct rev_info *revs,
 		show_commit, show_object, show_data,
 		NULL, NULL);
 }
+
+void traverse_commit_list_filtered(
+	struct object_filter_options *filter_options,
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *show_data)
+{
+	if (filter_options->omit_all_blobs)
+		traverse_commit_list_omit_all_blobs(
+			revs, show_commit, show_object, print_omitted_object, show_data);
+
+	else if (filter_options->omit_large_blobs)
+		traverse_commit_list_omit_large_blobs(
+			revs, show_commit, show_object, print_omitted_object, show_data,
+			(int64_t)(uint64_t)filter_options->large_byte_limit);
+
+	else if (filter_options->use_blob)
+		traverse_commit_list_use_blob(
+			revs, show_commit, show_object, print_omitted_object, show_data,
+			&filter_options->sparse_oid);
+
+	else if (filter_options->use_path)
+		traverse_commit_list_use_path(
+			revs, show_commit, show_object, print_omitted_object, show_data,
+			filter_options->sparse_value);
+
+	else
+		die("unspecified list-objects filter");
+}
diff --git a/list-objects.h b/list-objects.h
index 39fcbb5..a8acedc 100644
--- a/list-objects.h
+++ b/list-objects.h
@@ -1,6 +1,9 @@
 #ifndef LIST_OBJECTS_H
 #define LIST_OBJECTS_H
 
+#include "oidset2.h"
+#include "object-filter.h"
+
 typedef void (*show_commit_fn)(struct commit *, void *);
 typedef void (*show_object_fn)(struct object *, const char *, void *);
 void traverse_commit_list(struct rev_info *, show_commit_fn, show_object_fn, void *);
@@ -38,4 +41,12 @@ void traverse_commit_list_worker(
 	show_commit_fn, show_object_fn, void *show_data,
 	filter_object_fn filter, void *filter_data);
 
+void traverse_commit_list_filtered(
+	struct object_filter_options *filter_options,
+	struct rev_info *revs,
+	show_commit_fn show_commit,
+	show_object_fn show_object,
+	oidset2_foreach_cb print_omitted_object,
+	void *show_data);
+
 #endif
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 09/13] rev-list: add object filtering support
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
                   ` (2 preceding siblings ...)
  2017-09-22 20:30 ` [PATCH 08/13] list-objects: add traverse_commit_list_filtered method Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  2017-09-26 22:44   ` Jonathan Tan
  2017-09-22 20:30 ` [PATCH 10/13] rev-list: add filtering help text Jeff Hostetler
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Teach rev-list to use the filtering provided by the
traverse_commit_list_filtered() interface to omit
unwanted objects from the result.

This feature is only enabled when one of the "--objects*"
options are used.

When the "--filter-print-manifest" option is used, the
omitted objects and their sizes are printed at the end.
These are marked with a "~".  This can be combined with
"--quiet" to get a list of just the omitted objects.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 builtin/rev-list.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 2 deletions(-)

diff --git a/builtin/rev-list.c b/builtin/rev-list.c
index 95d84d5..5f54495 100644
--- a/builtin/rev-list.c
+++ b/builtin/rev-list.c
@@ -53,6 +53,8 @@ static const char rev_list_usage[] =
 
 static struct progress *progress;
 static unsigned progress_counter;
+static struct object_filter_options filter_options;
+static struct oidset2 missing_objects;
 
 static void finish_commit(struct commit *commit, void *data);
 static void show_commit(struct commit *commit, void *data)
@@ -179,8 +181,25 @@ static void finish_commit(struct commit *commit, void *data)
 static void finish_object(struct object *obj, const char *name, void *cb_data)
 {
 	struct rev_list_info *info = cb_data;
-	if (obj->type == OBJ_BLOB && !has_object_file(&obj->oid))
+	if (obj->type == OBJ_BLOB && !has_object_file(&obj->oid)) {
+		if (filter_options.print_missing) {
+			oidset2_insert(&missing_objects, &obj->oid, obj->type,
+				       -1, name);
+			return;
+		}
+		if (filter_options.relax) {
+			/*
+			 * Relax consistency checks to not complain about
+			 * omitted objects (presumably caused by use of
+			 * the previous use of the 'filter-objects' feature).
+			 *
+			 * Note that this is independent of any filtering that
+			 * we are doing in this run.
+			 */
+			return;
+		}
 		die("missing blob object '%s'", oid_to_hex(&obj->oid));
+	}
 	if (info->revs->verify_objects && !obj->parsed && obj->type != OBJ_COMMIT)
 		parse_object(&obj->oid);
 }
@@ -200,6 +219,25 @@ static void show_edge(struct commit *commit)
 	printf("-%s\n", oid_to_hex(&commit->object.oid));
 }
 
+static void print_omitted_object(int i, int i_limit, struct oidset2_entry *e, void *cb_data)
+{
+	/* struct rev_list_info *info = cb_data; */
+	const char *tn = typename(e->type);
+
+	if (e->object_length == -1)
+		printf("~%s %s\n", oid_to_hex(&e->oid), tn);
+	else
+		printf("~%s %s %"PRIuMAX"\n", oid_to_hex(&e->oid), tn, e->object_length);
+}
+
+static void print_missing_object(int i, int i_limit, struct oidset2_entry *e, void *cb_data)
+{
+	/* struct rev_list_info *info = cb_data; */
+	const char *tn = typename(e->type);
+
+	printf("?%s %s\n", oid_to_hex(&e->oid), tn);
+}
+
 static void print_var_str(const char *var, const char *val)
 {
 	printf("%s='%s'\n", var, val);
@@ -333,6 +371,15 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
 			show_progress = arg;
 			continue;
 		}
+		if (object_filter_hand_parse_arg(
+			    &filter_options, arg, 1, 1, 1)) {
+			if (!revs.blob_objects)
+				die(_("object filtering requires --objects"));
+			if (filter_options.use_blob &&
+			    !oidcmp(&filter_options.sparse_oid, &null_oid))
+				die(_("invalid sparse value"));
+			continue;
+		}
 		usage(rev_list_usage);
 
 	}
@@ -357,6 +404,11 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
 	if (revs.show_notes)
 		die(_("rev-list does not support display of notes"));
 
+	if (object_filter_enabled(&filter_options)) {
+		if (use_bitmap_index)
+			die(_("cannot combine --use-bitmap-index with object filtering"));
+	}
+
 	save_commit_buffer = (revs.verbose_header ||
 			      revs.grep_filter.pattern_list ||
 			      revs.grep_filter.header_list);
@@ -401,7 +453,24 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
 			return show_bisect_vars(&info, reaches, all);
 	}
 
-	traverse_commit_list(&revs, show_commit, show_object, &info);
+	if (filter_options.print_missing)
+		memset(&missing_objects, 0, sizeof(missing_objects));
+
+	if (object_filter_enabled(&filter_options))
+		traverse_commit_list_filtered(
+			&filter_options, &revs,
+			show_commit, show_object,
+			(filter_options.print_omitted
+			 ? print_omitted_object
+			 : NULL),
+			&info);
+	else
+		traverse_commit_list(&revs, show_commit, show_object, &info);
+
+	if (filter_options.print_missing) {
+		oidset2_foreach(&missing_objects, print_missing_object, &info);
+		oidset2_clear(&missing_objects);
+	}
 
 	stop_progress(&progress);
 
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 10/13] rev-list: add filtering help text
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
                   ` (3 preceding siblings ...)
  2017-09-22 20:30 ` [PATCH 09/13] rev-list: add object filtering support Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 11/13] t6112: rev-list object filtering test Jeff Hostetler
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 Documentation/git-rev-list.txt     |  9 ++++++++-
 Documentation/rev-list-options.txt | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-rev-list.txt b/Documentation/git-rev-list.txt
index ef22f17..b2e8255 100644
--- a/Documentation/git-rev-list.txt
+++ b/Documentation/git-rev-list.txt
@@ -47,7 +47,14 @@ SYNOPSIS
 	     [ --fixed-strings | -F ]
 	     [ --date=<format>]
 	     [ [ --objects | --objects-edge | --objects-edge-aggressive ]
-	       [ --unpacked ] ]
+	       [ --unpacked ]
+	       [ [ --filter-omit-all-blobs |
+		   --filter-omit-large-blobs=<n>[kmg] |
+		   --filter-use-blob=<blob-ish> |
+		   --filter-use-path=<path> ]
+		 [ --filter-print-missing ]
+		 [ --filter-print-omitted ] ] ]
+	     [ --filter-relax ]
 	     [ --pretty | --header ]
 	     [ --bisect ]
 	     [ --bisect-vars ]
diff --git a/Documentation/rev-list-options.txt b/Documentation/rev-list-options.txt
index a6cf9eb..5c7d53c 100644
--- a/Documentation/rev-list-options.txt
+++ b/Documentation/rev-list-options.txt
@@ -698,6 +698,38 @@ ifdef::git-rev-list[]
 --unpacked::
 	Only useful with `--objects`; print the object IDs that are not
 	in packs.
+
+--filter-omit-all-blobs::
+	Only useful with one of the `--objects*`; omits all blobs from
+	the printed list of objects.
+
+--filter-omit-large-blobs=<n>[kmg]::
+	Only useful with one of the `--objects*`; omits blobs larger than
+	n bytes from the printed list of objects.  May optionally be
+	followed by 'k', 'm', or 'g' units.  Value may be zero.  Special
+	files (matching ".git*") are always included, regardless of size.
+
+--filter-use-blob=<blob-ish>::
+--filter-use-path=<path>::
+	Only useful with one of the `--objects*`; uses a sparse-checkout
+	specification contained in the given object or file to filter the
+	result to only contain blobs referenced by such a sparse-checkout.
+
+--filter-print-missing::
+	Prints a list of the missing objects for the requested traversal.
+	Object IDs are prefixed with a ``?'' character.  The object type
+	is printed after the ID.  This may be used with or without any of
+	the above filtering options.
+
+--filter-print-omitted::
+	Only useful with one of the above `--filter*`; prints a list
+	of the omitted objects.  Object IDs are prefixed with a ``~''
+	character.  The object size is printed after the ID.
+
+--filter-relax::
+	Relax consistency checking for missing blobs.  Do not warn of
+	missing blobs during normal (non-filtering) object traversal
+	following an earlier partial/narrow clone or fetch.
 endif::git-rev-list[]
 
 --no-walk[=(sorted|unsorted)]::
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 11/13] t6112: rev-list object filtering test
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
                   ` (4 preceding siblings ...)
  2017-09-22 20:30 ` [PATCH 10/13] rev-list: add filtering help text Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 12/13] pack-objects: add object filtering support Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 13/13] pack-objects: add filtering help text Jeff Hostetler
  7 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 t/t6112-rev-list-filters-objects.sh | 237 ++++++++++++++++++++++++++++++++++++
 1 file changed, 237 insertions(+)
 create mode 100755 t/t6112-rev-list-filters-objects.sh

diff --git a/t/t6112-rev-list-filters-objects.sh b/t/t6112-rev-list-filters-objects.sh
new file mode 100755
index 0000000..66ff022
--- /dev/null
+++ b/t/t6112-rev-list-filters-objects.sh
@@ -0,0 +1,237 @@
+#!/bin/sh
+
+test_description='git rev-list with object filtering'
+
+. ./test-lib.sh
+
+# test the omit-all filter
+
+test_expect_success 'setup' '
+	echo "{print \$1}" >print_1.awk &&
+	echo "{print \$2}" >print_2.awk &&
+
+	for n in 1 2 3 4 5
+	do
+		echo $n > file.$n
+		git add file.$n
+		git commit -m "$n"
+	done
+'
+
+# Verify the omitted ("~OID") lines match the predicted list of OIDs.
+test_expect_success 'omit-all-blobs omitted 5 blobs' '
+	git ls-files -s file.1 file.2 file.3 file.4 file.5 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-all-blobs \
+		| awk -f print_1.awk \
+		| sed "s/~//" >observed &&
+	test_cmp observed expected
+'
+
+# Verify the complete OID list matches the unfiltered OIDs plus the omitted OIDs.
+test_expect_success 'omit-all-blobs nothing else changed' '
+	git rev-list HEAD --objects \
+		| awk -f print_1.awk \
+		| sort >expected &&
+	git rev-list HEAD --objects --filter-print-omitted --filter-omit-all-blobs \
+		| awk -f print_1.awk \
+		| sed "s/~//" \
+		| sort >observed &&
+	test_cmp observed expected
+'
+
+# test the size-based filtering.
+
+test_expect_success 'setup_large' '
+	for n in 1000 10000
+	do
+		printf "%"$n"s" X > large.$n
+		git add large.$n
+		git commit -m "$n"
+	done
+'
+
+test_expect_success 'omit-large-blobs omit 2 blobs' '
+	git ls-files -s large.1000 large.10000 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=500 \
+		| awk -f print_1.awk \
+		| sed "s/~//" >observed &&
+	test_cmp observed expected
+'
+
+test_expect_success 'omit-large-blobs nothing else changed' '
+	git rev-list HEAD --objects \
+		| awk -f print_1.awk \
+		| sort >expected &&
+	git rev-list HEAD --objects --filter-print-omitted --filter-omit-large-blobs=500 \
+		| awk -f print_1.awk \
+		| sed "s/~//" \
+		| sort >observed &&
+	test_cmp observed expected
+'
+
+# boundary test around the size parameter.
+# filter is strictly less than the value, so size 500 and 1000 should have the
+# same results, but 1001 should filter more.
+
+test_expect_success 'omit-large-blobs omit 2 blobs' '
+	git ls-files -s large.1000 large.10000 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=1000 \
+		| awk -f print_1.awk \
+		| sed "s/~//" >observed &&
+	test_cmp observed expected
+'
+
+test_expect_success 'omit-large-blobs omit 1 blob' '
+	git ls-files -s large.10000 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=1001 \
+		| awk -f print_1.awk \
+		| sed "s/~//" >observed &&
+	test_cmp observed expected
+'
+
+test_expect_success 'omit-large-blobs omit 1 blob (1k)' '
+	git ls-files -s large.10000 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=1k \
+		| awk -f print_1.awk \
+		| sed "s/~//" >observed &&
+	test_cmp observed expected
+'
+
+test_expect_success 'omit-large-blobs omit no blob (1m)' '
+	cat </dev/null >expected &&
+	git rev-list HEAD --quiet --objects --filter-print-omitted --filter-omit-large-blobs=1m \
+		| awk -f print_1.awk \
+		| sed "s/~//" >observed &&
+	test_cmp observed expected
+'
+
+# Test sparse-pattern filtering (using explicit local patterns).
+# We use the same disk format as sparse-checkout to specify the
+# filtering, but do not require sparse-checkout to be enabled.
+
+test_expect_success 'setup using sparse file' '
+	mkdir dir1 &&
+	for n in sparse1 sparse2
+	do
+		echo $n > $n
+		git add $n
+		echo dir1/$n > dir1/$n
+		git add dir1/$n
+	done &&
+	git commit -m "sparse" &&
+	echo dir1/ >pattern1 &&
+	echo sparse1 >pattern2
+'
+
+# pattern1 should only include the 2 dir1/* files.
+# and omit the 5 file.*, 2 large.*, and 2 top-level sparse* files.
+test_expect_success 'sparse using path pattern1' '
+	git rev-list HEAD --objects --filter-print-omitted --filter-use-path=pattern1 >out &&
+
+	grep "^~" out >blobs.omitted &&
+	test $(cat blobs.omitted | wc -l) = 9 &&
+
+	grep "dir1/sparse" out >blobs.included &&
+	test $(cat blobs.included | wc -l) = 2
+'
+
+# pattern2 should include the sparse1 and dir1/sparse1.
+# and omit the 5 file.*, 2 large.*, and the 2 sparse2 files.
+test_expect_success 'sparse using path pattern2' '
+	git rev-list HEAD --objects --filter-print-omitted --filter-use-path=pattern2 >out &&
+
+	grep "^~" out >blobs.omitted &&
+	test $(cat blobs.omitted | wc -l) = 9 &&
+
+	grep "sparse1" out >blobs.included &&
+	test $(cat blobs.included | wc -l) = 2
+'
+
+# Test sparse-pattern filtering (using a blob in the repo).
+# This could be used to later let pack-objects do filtering.
+
+# pattern1 should only include the 2 dir1/* files.
+# and omit the 5 file.*, 2 large.*, 2 top-level sparse*, and 1 pattern file.
+test_expect_success 'sparse using OID for pattern1' '
+	git add pattern1 &&
+	git commit -m "pattern1" &&
+
+	git rev-list HEAD --objects >normal.output &&
+	grep "pattern1" <normal.output | awk "{print \$1;}" >pattern1.oid &&
+
+	git rev-list HEAD --objects --filter-print-omitted --filter-use-blob=`cat pattern1.oid` >out &&
+
+	grep "^~" out >blobs.omitted &&
+	test $(cat blobs.omitted | wc -l) = 10 &&
+
+	grep "dir1/sparse" out >blobs.included &&
+	test $(cat blobs.included | wc -l) = 2
+'
+
+# repeat previous test but use blob-ish expression rather than OID.
+test_expect_success 'sparse using blob-ish to get OID for pattern spec' '
+	git rev-list HEAD --objects --filter-print-omitted --filter-use-blob=HEAD:pattern1 >out &&
+
+	grep "^~" out >blobs.omitted &&
+	test $(cat blobs.omitted | wc -l) = 10 &&
+
+	grep "dir1/sparse" out >blobs.included &&
+	test $(cat blobs.included | wc -l) = 2
+'
+
+# pattern2 should include the sparse1 and dir1/sparse1.
+# and omit the 5 file.*, 2 large.*, 2 top-level sparse*, and 2 pattern files.
+test_expect_success 'sparse using OID for pattern2' '
+	git add pattern2 &&
+	git commit -m "pattern2" &&
+
+	git rev-list HEAD --objects >normal.output &&
+	grep "pattern2" <normal.output | awk "{print \$1;}" >pattern2.oid &&
+
+	git rev-list HEAD --objects --filter-print-omitted --filter-use-blob=`cat pattern2.oid` >out &&
+
+	grep "^~" out >blobs.omitted &&
+	test $(cat blobs.omitted | wc -l) = 11 &&
+
+	grep "sparse1" out >blobs.included &&
+	test $(cat blobs.included | wc -l) = 2
+'
+
+# repeat previous test but use blob-ish expression rather than OID.
+test_expect_success 'sparse using blob-ish rather than OID for pattern2' '
+	git rev-list HEAD --objects --filter-print-omitted --filter-use-blob=HEAD:pattern2 >out &&
+
+	grep "^~" out >blobs.omitted &&
+	test $(cat blobs.omitted | wc -l) = 11 &&
+
+	grep "sparse1" out >blobs.included &&
+	test $(cat blobs.included | wc -l) = 2
+'
+
+# delete some loose objects and test rev-list printing them as missing.
+test_expect_success 'print missing objects' '
+	git ls-files -s file.1 file.2 file.3 file.4 file.5 \
+		| awk -f print_2.awk \
+		| sort >expected &&
+	for id in `cat expected | sed "s|..|&/|"`
+	do
+		rm .git/objects/$id
+	done &&
+	git rev-list --quiet HEAD --filter-print-missing --objects \
+		| awk -f print_1.awk \
+		| sed "s/?//" \
+		| sort >observed &&
+	test_cmp observed expected
+'
+
+test_done
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 12/13] pack-objects: add object filtering support
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
                   ` (5 preceding siblings ...)
  2017-09-22 20:30 ` [PATCH 11/13] t6112: rev-list object filtering test Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  2017-09-22 20:30 ` [PATCH 13/13] pack-objects: add filtering help text Jeff Hostetler
  7 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Teach pack-objects to use the filtering provided by the
traverse_commit_list_filtered() interface to omit unwanted
objects from the resulting packfile.

This feature is intended for partial clone/fetch.

Filtering requires the use of the "--stdout" option.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 builtin/pack-objects.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index f4a8441..1712d1b 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -78,6 +78,8 @@ static unsigned long cache_max_small_delta_size = 1000;
 
 static unsigned long window_memory_limit = 0;
 
+static struct object_filter_options filter_options;
+
 /*
  * stats
  */
@@ -2810,7 +2812,12 @@ static void get_object_list(int ac, const char **av)
 	if (prepare_revision_walk(&revs))
 		die("revision walk setup failed");
 	mark_edges_uninteresting(&revs, show_edge);
-	traverse_commit_list(&revs, show_commit, show_object, NULL);
+	if (object_filter_enabled(&filter_options))
+		traverse_commit_list_filtered(&filter_options, &revs,
+					      show_commit, show_object,
+					      NULL, NULL);
+	else
+		traverse_commit_list(&revs, show_commit, show_object, NULL);
 
 	if (unpack_unreachable_expiration) {
 		revs.ignore_missing_links = 1;
@@ -2946,6 +2953,15 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 			 N_("use a bitmap index if available to speed up counting objects")),
 		OPT_BOOL(0, "write-bitmap-index", &write_bitmap_index,
 			 N_("write a bitmap index together with the pack index")),
+
+		OPT_PARSE_FILTER_OMIT_ALL_BLOBS(&filter_options),
+		OPT_PARSE_FILTER_OMIT_LARGE_BLOBS(&filter_options),
+		OPT_PARSE_FILTER_USE_BLOB(&filter_options),
+		OPT_PARSE_FILTER_USE_PATH(&filter_options),
+		/* not needed: OPT_PARSE_FILTER_PRINT_MISSING */
+		/* not needed: OPT_PARSE_FILTER_PRINT_OMITTED */
+		/* not needed: OPT_PARSE_FILTER_RELAX */
+
 		OPT_END(),
 	};
 
@@ -3022,6 +3038,12 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (!rev_list_all || !rev_list_reflog || !rev_list_index)
 		unpack_unreachable_expiration = 0;
 
+	if (object_filter_enabled(&filter_options)) {
+		if (!pack_to_stdout)
+			die("cannot use filtering with an indexable pack.");
+		use_bitmap_index = 0;
+	}
+
 	/*
 	 * "soft" reasons not to use bitmaps - for on-disk repack by default we want
 	 *
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 13/13] pack-objects: add filtering help text
  2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
                   ` (6 preceding siblings ...)
  2017-09-22 20:30 ` [PATCH 12/13] pack-objects: add object filtering support Jeff Hostetler
@ 2017-09-22 20:30 ` Jeff Hostetler
  7 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-22 20:30 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, jonathantanmy, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Add help text for new object filtering options to
pack-objects documentation.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 Documentation/git-pack-objects.txt | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 8973510..1ed7d4b 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -231,6 +231,23 @@ So does `git bundle` (see linkgit:git-bundle[1]) when it creates a bundle.
 	With this option, parents that are hidden by grafts are packed
 	nevertheless.
 
+--filter-omit-all-blobs::
+	Requires `--stdout`.  Omits all blobs from the packfile.
+
+--filter-omit-large-blobs=<n>[kmg]::
+	Requires `--stdout`.  Omits large blobs larger than n bytes from
+	the packfile.  May optionally be followed by 'k', 'm', or 'g' units.
+	Value may be zero.  Special files (matching ".git*") are always
+	included, regardless of size.
+
+--filter-use-blob=<blob-ish>::
+--filter-use-path=<path>::
+	Requires `--stdout`.  Use a sparse-checkout specification to
+	filter the resulting packfile to only contain the blobs that
+	would be referenced by such a sparse-checkout.  `<path>` specifies
+	a local pathname.  `<blob-ish>` specifies an expression that can
+	be evaluated to a blob.
+
 SEE ALSO
 --------
 linkgit:git-rev-list[1]
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 07/13] object-filter: common declarations for object filtering
  2017-09-22 20:30 ` [PATCH 07/13] object-filter: common declarations for object filtering Jeff Hostetler
@ 2017-09-26 22:39   ` Jonathan Tan
  2017-09-27 17:09     ` Jeff Hostetler
  0 siblings, 1 reply; 16+ messages in thread
From: Jonathan Tan @ 2017-09-26 22:39 UTC (permalink / raw)
  To: Jeff Hostetler; +Cc: git, gitster, peff, Jeff Hostetler

On Fri, 22 Sep 2017 20:30:11 +0000
Jeff Hostetler <git@jeffhostetler.com> wrote:

>  Makefile        |   1 +
>  object-filter.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  object-filter.h | 173 ++++++++++++++++++++++++++++++++++++
>  3 files changed, 443 insertions(+)
>  create mode 100644 object-filter.c
>  create mode 100644 object-filter.h

I think these and list-objects-filter-* are multiple levels of
indirection too many. Would a single file with a few implementations of
filter_object_fn be sufficient?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 09/13] rev-list: add object filtering support
  2017-09-22 20:30 ` [PATCH 09/13] rev-list: add object filtering support Jeff Hostetler
@ 2017-09-26 22:44   ` Jonathan Tan
  2017-09-27 17:26     ` Jeff Hostetler
  0 siblings, 1 reply; 16+ messages in thread
From: Jonathan Tan @ 2017-09-26 22:44 UTC (permalink / raw)
  To: Jeff Hostetler; +Cc: git, gitster, peff, Jeff Hostetler

On Fri, 22 Sep 2017 20:30:13 +0000
Jeff Hostetler <git@jeffhostetler.com> wrote:

> +		if (filter_options.relax) {

Add some documentation about how this differs from ignore_missing_links
in struct rev_info.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 07/13] object-filter: common declarations for object filtering
  2017-09-26 22:39   ` Jonathan Tan
@ 2017-09-27 17:09     ` Jeff Hostetler
  2017-09-28  0:05       ` Jonathan Tan
  0 siblings, 1 reply; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-27 17:09 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, gitster, peff, Jeff Hostetler



On 9/26/2017 6:39 PM, Jonathan Tan wrote:
> On Fri, 22 Sep 2017 20:30:11 +0000
> Jeff Hostetler <git@jeffhostetler.com> wrote:
> 
>>   Makefile        |   1 +
>>   object-filter.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   object-filter.h | 173 ++++++++++++++++++++++++++++++++++++
>>   3 files changed, 443 insertions(+)
>>   create mode 100644 object-filter.c
>>   create mode 100644 object-filter.h
> 
> I think these and list-objects-filter-* are multiple levels of
> indirection too many. Would a single file with a few implementations of
> filter_object_fn be sufficient?

I did that in my first draft and I found it confusing.

Each filter has 3 parts (some filter-specific data structures,
a filter callback routine, a driver to call the traverse code).
I found it easier to reason about each filter in isolation.
And it makes it easier to work on each independently and keep
their inclusion in separate commits.


  

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 09/13] rev-list: add object filtering support
  2017-09-26 22:44   ` Jonathan Tan
@ 2017-09-27 17:26     ` Jeff Hostetler
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-27 17:26 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, gitster, peff, Jeff Hostetler



On 9/26/2017 6:44 PM, Jonathan Tan wrote:
> On Fri, 22 Sep 2017 20:30:13 +0000
> Jeff Hostetler <git@jeffhostetler.com> wrote:
> 
>> +		if (filter_options.relax) {
> 
> Add some documentation about how this differs from ignore_missing_links
> in struct rev_info.
> 

It's unclear what that field means.  (It's not documented that I saw.)
And it is explicitly turned on and off in pack-bitmap.c, so again I'm
not sure how that agrees with what I'm doing.

My relax field is probably an interim thing (to allow rev-list, index-pack,
and friends not complain about missing objects during my development).

I suspect that you and I will need to merge our efforts here.  You have
a similar variable "revs->exclude_promisor_objects".  But I need to study
how our usages differ.

Thanks,
Jeff

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 07/13] object-filter: common declarations for object filtering
  2017-09-27 17:09     ` Jeff Hostetler
@ 2017-09-28  0:05       ` Jonathan Tan
  2017-09-28 14:33         ` Jeff Hostetler
  0 siblings, 1 reply; 16+ messages in thread
From: Jonathan Tan @ 2017-09-28  0:05 UTC (permalink / raw)
  To: Jeff Hostetler; +Cc: git, gitster, peff, Jeff Hostetler

On Wed, 27 Sep 2017 13:09:42 -0400
Jeff Hostetler <git@jeffhostetler.com> wrote:

> On 9/26/2017 6:39 PM, Jonathan Tan wrote:
> > On Fri, 22 Sep 2017 20:30:11 +0000
> > Jeff Hostetler <git@jeffhostetler.com> wrote:
> > 
> >>   Makefile        |   1 +
> >>   object-filter.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>   object-filter.h | 173 ++++++++++++++++++++++++++++++++++++
> >>   3 files changed, 443 insertions(+)
> >>   create mode 100644 object-filter.c
> >>   create mode 100644 object-filter.h
> > 
> > I think these and list-objects-filter-* are multiple levels of
> > indirection too many. Would a single file with a few implementations of
> > filter_object_fn be sufficient?
> 
> I did that in my first draft and I found it confusing.
> 
> Each filter has 3 parts (some filter-specific data structures,
> a filter callback routine, a driver to call the traverse code).
> I found it easier to reason about each filter in isolation.
> And it makes it easier to work on each independently and keep
> their inclusion in separate commits.

I looked at object-filter.{c,h} a bit more. It seems that these files:
 1) define a struct that contains all the options that we would want
 2) supplies a way to populate this struct from code that uses parse-options
 3) supplies a way to populate this struct from code that calculates
    options by hand
 4) supplies a way to populate this struct from "protocol" ("<key>" or
    "<key> <value>" strings)

And the next commit takes the struct that object-filter.{c,h} produces
and actually performs the traversal.

I think this can be significantly simplified, though. Would this work:
 a) Define the object_filter_options struct, but make all fields
    correspond to a single parameter each. Define
    OBJECT_FILTER_OPTIONS_INIT to initialize everything to 0 except for
    large_byte_limit to ULONG_MAX (so that we can detect if something
    else is set to it).
 b) Define one single OPT_PARSE_FILTER macro containing all the
    parameters. We can use the non-callback macros here. That solves 2)
    above.
 c) Define a function that takes in (int *argc, char ***argv) that can
    "massage" it to remove all filter-related arguments, storing them in
    a object_filter_options struct. That solves 3) above. As discussed
    in the API documentation, this means that argument lists of the form
    "--unknown --known" (where "--unknown" takes an argument) are
    processed differently, but then again, rev-list never supported them
    anyway (it required "--unknown=<arg>").
 d) Define a function that converts "<key>" into "--<key>" and "<key>
    <value>" into "--<key>=<value>", and use the existing mechanism.
    That solves 4) above.

This removes the need to maintain the lists of one-per-argument
functions, including the parse_filter_* and opt_parse_filter_* functions
declared in the header file. If we were to add a feature, we wouldn't
need to change anything in the caller, and wouldn't need to hand-edit
object_filter_hand_parse_arg() and object_filter_hand_parse_protocol().

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 07/13] object-filter: common declarations for object filtering
  2017-09-28  0:05       ` Jonathan Tan
@ 2017-09-28 14:33         ` Jeff Hostetler
  2017-09-29 19:47           ` Jonathan Tan
  0 siblings, 1 reply; 16+ messages in thread
From: Jeff Hostetler @ 2017-09-28 14:33 UTC (permalink / raw)
  To: Jonathan Tan; +Cc: git, gitster, peff, Jeff Hostetler



On 9/27/2017 8:05 PM, Jonathan Tan wrote:
> On Wed, 27 Sep 2017 13:09:42 -0400
> Jeff Hostetler <git@jeffhostetler.com> wrote:
> 
>> On 9/26/2017 6:39 PM, Jonathan Tan wrote:
>>> On Fri, 22 Sep 2017 20:30:11 +0000
>>> Jeff Hostetler <git@jeffhostetler.com> wrote:
>>>
>>>>    Makefile        |   1 +
>>>>    object-filter.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    object-filter.h | 173 ++++++++++++++++++++++++++++++++++++
>>>>    3 files changed, 443 insertions(+)
>>>>    create mode 100644 object-filter.c
>>>>    create mode 100644 object-filter.h
>>>
>>> I think these and list-objects-filter-* are multiple levels of
>>> indirection too many. Would a single file with a few implementations of
>>> filter_object_fn be sufficient?
>>
>> I did that in my first draft and I found it confusing.
>>
>> Each filter has 3 parts (some filter-specific data structures,
>> a filter callback routine, a driver to call the traverse code).
>> I found it easier to reason about each filter in isolation.
>> And it makes it easier to work on each independently and keep
>> their inclusion in separate commits.
> 
> I looked at object-filter.{c,h} a bit more. It seems that these files:
>   1) define a struct that contains all the options that we would want
>   2) supplies a way to populate this struct from code that uses parse-options
>   3) supplies a way to populate this struct from code that calculates
>      options by hand
>   4) supplies a way to populate this struct from "protocol" ("<key>" or
>      "<key> <value>" strings)
> 
> And the next commit takes the struct that object-filter.{c,h} produces
> and actually performs the traversal.
> 
> I think this can be significantly simplified, though. Would this work:
>   a) Define the object_filter_options struct, but make all fields
>      correspond to a single parameter each. Define
>      OBJECT_FILTER_OPTIONS_INIT to initialize everything to 0 except for
>      large_byte_limit to ULONG_MAX (so that we can detect if something
>      else is set to it).
>   b) Define one single OPT_PARSE_FILTER macro containing all the
>      parameters. We can use the non-callback macros here. That solves 2)
>      above.
>   c) Define a function that takes in (int *argc, char ***argv) that can
>      "massage" it to remove all filter-related arguments, storing them in
>      a object_filter_options struct. That solves 3) above. As discussed
>      in the API documentation, this means that argument lists of the form
>      "--unknown --known" (where "--unknown" takes an argument) are
>      processed differently, but then again, rev-list never supported them
>      anyway (it required "--unknown=<arg>").
>   d) Define a function that converts "<key>" into "--<key>" and "<key>
>      <value>" into "--<key>=<value>", and use the existing mechanism.
>      That solves 4) above.
> 
> This removes the need to maintain the lists of one-per-argument
> functions, including the parse_filter_* and opt_parse_filter_* functions
> declared in the header file. If we were to add a feature, we wouldn't
> need to change anything in the caller, and wouldn't need to hand-edit
> object_filter_hand_parse_arg() and object_filter_hand_parse_protocol().

Maybe.  What I have here now is the result of adding these arguments to
rev-list and pack-objects (in the current patch series), and also to
fetch-pack, fetch, clone, upload-pack, index-pack, and the transport and
protocol code (in a follow-on patch series that I've omitted for the moment).
And there will probably be a few more, such as fsck, gc, and etc.  I hesitate
to refine the macros too much further until we've agreement on the overall
approach and terms.

Thanks
Jeff



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 07/13] object-filter: common declarations for object filtering
  2017-09-28 14:33         ` Jeff Hostetler
@ 2017-09-29 19:47           ` Jonathan Tan
  0 siblings, 0 replies; 16+ messages in thread
From: Jonathan Tan @ 2017-09-29 19:47 UTC (permalink / raw)
  To: Jeff Hostetler; +Cc: git, gitster, peff, Jeff Hostetler

On Thu, 28 Sep 2017 10:33:39 -0400
Jeff Hostetler <git@jeffhostetler.com> wrote:

> Maybe.  What I have here now is the result of adding these arguments to
> rev-list and pack-objects (in the current patch series), and also to
> fetch-pack, fetch, clone, upload-pack, index-pack, and the transport and
> protocol code (in a follow-on patch series that I've omitted for the moment).
> And there will probably be a few more, such as fsck, gc, and etc.  I hesitate
> to refine the macros too much further until we've agreement on the overall
> approach and terms.

Fair enough. My current opinion on the overall approach (others might
differ, of course):
 - Filtering based on a sparse checkout specification in rev-list and
   pack-objects sounds useful to me, and is worth the filtering
   mechanism.
 - Filtering based on size (or based on type) still doesn't seem useful
   to me in rev-list, but if we're going to implement the filtering
   mechanism anyway, we might as well use the mechanism.
 - Besides my comments in [1], I think the API could still be slightly
   better organized. For example, object-filter probably should be the
   one to define the traverse_ function that takes in struct
   object_filter_options, and optionally a set of excluded objects to
   populate.

[1] https://public-inbox.org/git/20170927170533.65498396e008fa148a3fda90@google.com/

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2017-09-29 19:47 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-22 20:30 [PATCH 05/13] list-objects-filter-large: add large blob filter to list-objects Jeff Hostetler
2017-09-22 20:30 ` [PATCH 06/13] list-objects-filter-sparse: add sparse-checkout based filter Jeff Hostetler
2017-09-22 20:30 ` [PATCH 07/13] object-filter: common declarations for object filtering Jeff Hostetler
2017-09-26 22:39   ` Jonathan Tan
2017-09-27 17:09     ` Jeff Hostetler
2017-09-28  0:05       ` Jonathan Tan
2017-09-28 14:33         ` Jeff Hostetler
2017-09-29 19:47           ` Jonathan Tan
2017-09-22 20:30 ` [PATCH 08/13] list-objects: add traverse_commit_list_filtered method Jeff Hostetler
2017-09-22 20:30 ` [PATCH 09/13] rev-list: add object filtering support Jeff Hostetler
2017-09-26 22:44   ` Jonathan Tan
2017-09-27 17:26     ` Jeff Hostetler
2017-09-22 20:30 ` [PATCH 10/13] rev-list: add filtering help text Jeff Hostetler
2017-09-22 20:30 ` [PATCH 11/13] t6112: rev-list object filtering test Jeff Hostetler
2017-09-22 20:30 ` [PATCH 12/13] pack-objects: add object filtering support Jeff Hostetler
2017-09-22 20:30 ` [PATCH 13/13] pack-objects: add filtering help text Jeff Hostetler

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.