Skip to content

Commit a6bdfa4

Browse files
jeffhostetlerdscho
authored andcommitted
survey: add vector of largest objects for various scaling dimensions
Create `struct large_item` and `struct large_item_vec` to capture the n largest commits, trees, and blobs under various scaling dimensions, such as size in bytes, number of commit parents, or number of entries in a tree. Each of these has a command line option so that it can be set independently. Signed-off-by: Jeff Hostetler <[email protected]>
1 parent 3bd3d96 commit a6bdfa4

File tree

3 files changed

+276
-6
lines changed

3 files changed

+276
-6
lines changed

Documentation/config/survey.txt

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,33 @@ survey.*::
1111
top::
1212
This integer value implies `--top=<N>`, specifying the
1313
number of entries in the detail tables.
14+
showBlobSizes::
15+
A non-negative integer value. Requests details on the
16+
<n> largest file blobs by size in bytes. Provides a
17+
default value for `--blob-sizes=<n>` in
18+
linkgit:git-survey[1].
19+
showCommitParents::
20+
A non-negative integer value. Requests details on the
21+
<n> commits with the most number of parents. Provides a
22+
default value for `--commit-parents=<n>` in
23+
linkgit:git-survey[1].
24+
showCommitSizes::
25+
A non-negative integer value. Requests details on the
26+
<n> largest commits by size in bytes. Generally, these
27+
are the commits with the largest commit messages.
28+
Provides a default value for `--commit-sizes=<n>` in
29+
linkgit:git-survey[1].
30+
showTreeEntries::
31+
A non-negative integer value. Requests details on the
32+
<n> trees (directories) with the most number of entries
33+
(files and subdirectories). Provides a default value
34+
for `--tree-entries=<n>` in linkgit:git-survey[1].
35+
showTreeSizes::
36+
A non-negative integer value. Requests details on the
37+
<n> largest trees (directories) by size in bytes. This
38+
set will usually be equal to the
39+
`survey.showTreeEntries` set, but may be skewed by very
40+
long file or subdirectory entry names. Provides a
41+
default value for `--tree-sizes=<n>` in
42+
linkgit:git-survey[1].
1443
--

Documentation/git-survey.txt

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,32 @@ only refs for the given options are added.
5959
--other::
6060
Add notes (`refs/notes/`) and stashes (`refs/stash/`) to the set.
6161

62+
Large Item Selection
63+
~~~~~~~~~~~~~~~~~~~~
64+
65+
The following options control the optional display of large items under
66+
various dimensions of scale. The OID of the largest `n` objects will be
67+
displayed in reverse sorted order. For each, `n` defaults to 10.
68+
69+
--commit-parents::
70+
Shows the OIDs of the commits with the most parent commits.
71+
72+
--commit-sizes::
73+
Shows the OIDs of the largest commits by size in bytes. These are
74+
usually the ones with the largest commit messages.
75+
76+
--tree-entries::
77+
Shows the OIDs of the trees with the most number of entries. These
78+
are the directories with the most number of files or subdirectories.
79+
80+
--tree-sizes::
81+
Shows the OIDs of the largest trees by size in bytes. This set
82+
will usually be the same as the vector of number of entries unless
83+
skewed by very long entry names.
84+
85+
--blob-sizes::
86+
Shows the OIDs of the largest blobs by size in bytes.
87+
6288
OUTPUT
6389
------
6490

@@ -78,6 +104,11 @@ Reachable Object Summary
78104
The reachable object summary shows the total number of each kind of Git
79105
object, including tags, commits, trees, and blobs.
80106

107+
CONFIGURATION
108+
-------------
109+
110+
include::config/survey.txt[]
111+
81112
GIT
82113
---
83114
Part of the linkgit:git[1] suite

builtin/survey.c

Lines changed: 216 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,15 @@ static struct survey_refs_wanted default_ref_options = {
4141
/*
 * Options for a survey run, filled from configuration and from the
 * command line.
 */
struct survey_opts {
4242
int verbose;
4343
int show_progress;
44+
45+
/*
 * Number of entries to keep in each "largest items" vector.  A value
 * of zero leaves the corresponding vector unallocated, so nothing is
 * collected or reported for that dimension.
 *
 * NOTE(review): these are plain `int`s that are later passed as a
 * `uint64_t` count; no rejection of negative values is visible in
 * this part of the file -- confirm one exists or add it.
 */
int show_largest_commits_by_nr_parents;
46+
int show_largest_commits_by_size_bytes;
47+
48+
int show_largest_trees_by_nr_entries;
49+
int show_largest_trees_by_size_bytes;
50+
51+
int show_largest_blobs_by_size_bytes;
52+
4453
int top_nr;
4554
struct survey_refs_wanted refs;
4655
};
@@ -138,6 +147,87 @@ static void incr_obj_hist_bin(struct obj_hist_bin *pbin,
138147
pbin->cnt_seen++;
139148
}
140149

150+
/*
151+
* Remember the largest n objects for some scaling dimension. This
152+
* could be the observed object size or number of entries in a tree.
153+
* We'll use this to generate a sorted vector in the output for that
154+
* dimension.
155+
*/
156+
/* One remembered object within a large_item_vec. */
struct large_item {
157+
/*
 * Magnitude in the owning vector's dimension; callers store a byte
 * length, a parent count, or a tree entry count here.
 */
uint64_t size;
158+
/* Identity of the object that produced `size`. */
struct object_id oid;
159+
};
160+
161+
struct large_item_vec {
162+
char *dimension_label;
163+
char *item_label;
164+
uint64_t nr_items;
165+
struct large_item items[FLEX_ARRAY]; /* nr_items */
166+
};
167+
168+
static struct large_item_vec *alloc_large_item_vec(const char *dimension_label,
169+
const char *item_label,
170+
uint64_t nr_items)
171+
{
172+
struct large_item_vec *vec;
173+
size_t flex_len = nr_items * sizeof(struct large_item);
174+
175+
if (!nr_items)
176+
return NULL;
177+
178+
vec = xcalloc(1, (sizeof(struct large_item_vec) + flex_len));
179+
vec->dimension_label = strdup(dimension_label);
180+
vec->item_label = strdup(item_label);
181+
vec->nr_items = nr_items;
182+
183+
return vec;
184+
}
185+
186+
static void free_large_item_vec(struct large_item_vec *vec)
187+
{
188+
if (!vec)
189+
return;
190+
191+
free(vec->dimension_label);
192+
free(vec->item_label);
193+
free(vec);
194+
}
195+
196+
static void maybe_insert_large_item(struct large_item_vec *vec,
197+
uint64_t size,
198+
struct object_id *oid)
199+
{
200+
size_t rest_len;
201+
size_t k;
202+
203+
if (!vec || !vec->nr_items)
204+
return;
205+
206+
/*
207+
* Since the odds an object being among the largest n
208+
* is small, shortcut and see if it is smaller than
209+
* the smallest one in our set and quickly reject it.
210+
*/
211+
if (size < vec->items[vec->nr_items - 1].size)
212+
return;
213+
214+
for (k = 0; k < vec->nr_items; k++) {
215+
if (size < vec->items[k].size)
216+
continue;
217+
218+
/* push items[k..] down one and insert it here */
219+
220+
rest_len = (vec->nr_items - k - 1) * sizeof(struct large_item);
221+
if (rest_len)
222+
memmove(&vec->items[k + 1], &vec->items[k], rest_len);
223+
224+
memset(&vec->items[k], 0, sizeof(struct large_item));
225+
vec->items[k].size = size;
226+
oidcpy(&vec->items[k].oid, oid);
227+
return;
228+
}
229+
}
230+
141231
/*
142232
* Common fields for any type of object.
143233
*/
@@ -183,6 +273,9 @@ struct survey_stats_commits {
183273
* Count of commits with k parents.
184274
*/
185275
uint32_t parent_cnt_pbin[PBIN_VEC_LEN];
276+
277+
struct large_item_vec *vec_largest_by_nr_parents;
278+
struct large_item_vec *vec_largest_by_size_bytes;
186279
};
187280

188281
/*
@@ -192,11 +285,18 @@ struct survey_stats_trees {
192285
struct survey_stats_base_object base;
193286

194287
/*
195-
* In the following, nr_entries refers to the number of files or
196-
* subdirectories in a tree. We are interested in how wide the
197-
* tree is and if the repo has gigantic directories.
288+
* Keep a vector of the trees with the most number of entries.
289+
* This gives us a feel for the width of a tree when there are
290+
* gigantic directories.
198291
*/
199-
uint64_t max_entries; /* max(nr_entries) -- the width of the largest tree */
292+
struct large_item_vec *vec_largest_by_nr_entries;
293+
294+
/*
295+
* Keep a vector of the trees with the largest size in bytes.
296+
* The contents of this may or may not match items in the other
297+
* vector, since entryname length can alter the results.
298+
*/
299+
struct large_item_vec *vec_largest_by_size_bytes;
200300

201301
/*
202302
* Computing the sum of the number of entries across all trees
@@ -216,6 +316,11 @@ struct survey_stats_trees {
216316
*/
217317
struct survey_stats_blobs {
218318
struct survey_stats_base_object base;
319+
320+
/*
321+
* Remember the sizes (in bytes) and OIDs of the largest n blobs.
* Entries are offered through maybe_insert_large_item() while
* objects are counted, and the vector is released in
* clear_survey_context().
322+
*/
323+
struct large_item_vec *vec_largest_by_size_bytes;
219324
};
220325

221326
struct survey_report_object_summary {
@@ -396,6 +501,12 @@ struct survey_context {
396501

397502
/*
 * Release what the survey context holds: the five optional
 * "largest items" vectors (free_large_item_vec() ignores NULL, so
 * vectors that were never allocated are fine), then the `ref_array`
 * and the `refs` strvec.
 */
static void clear_survey_context(struct survey_context *ctx)
398503
{
504+
free_large_item_vec(ctx->report.reachable_objects.commits.vec_largest_by_nr_parents);
505+
free_large_item_vec(ctx->report.reachable_objects.commits.vec_largest_by_size_bytes);
506+
free_large_item_vec(ctx->report.reachable_objects.trees.vec_largest_by_nr_entries);
507+
free_large_item_vec(ctx->report.reachable_objects.trees.vec_largest_by_size_bytes);
508+
free_large_item_vec(ctx->report.reachable_objects.blobs.vec_largest_by_size_bytes);
509+
399510
ref_array_clear(&ctx->ref_array);
400511
strvec_clear(&ctx->refs);
401512
}
@@ -608,6 +719,32 @@ static void survey_report_commit_parents(struct survey_context *ctx)
608719
clear_table(&table);
609720
}
610721

722+
static void survey_report_largest_vec(struct large_item_vec *vec)
723+
{
724+
struct survey_table table = SURVEY_TABLE_INIT;
725+
struct strbuf size = STRBUF_INIT;
726+
727+
if (!vec || !vec->nr_items)
728+
return;
729+
730+
table.table_name = vec->dimension_label;
731+
strvec_pushl(&table.header, "Size", "OID", NULL);
732+
733+
for (size_t k = 0; k < vec->nr_items; k++) {
734+
struct large_item *pk = &vec->items[k];
735+
if (!is_null_oid(&pk->oid)) {
736+
strbuf_reset(&size);
737+
strbuf_addf(&size, "%"PRIuMAX, (uintmax_t)pk->size);
738+
739+
insert_table_rowv(&table, size.buf, oid_to_hex(&pk->oid), NULL);
740+
}
741+
}
742+
strbuf_release(&size);
743+
744+
print_table_plaintext(&table);
745+
clear_table(&table);
746+
}
747+
611748
static void survey_report_plaintext_refs(struct survey_context *ctx)
612749
{
613750
struct survey_report_ref_summary *refs = &ctx->report.refs;
@@ -787,6 +924,12 @@ static void survey_report_plaintext(struct survey_context *ctx)
787924
&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]);
788925
survey_report_plaintext_sorted_size(
789926
&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]);
927+
928+
survey_report_largest_vec(ctx->report.reachable_objects.commits.vec_largest_by_nr_parents);
929+
survey_report_largest_vec(ctx->report.reachable_objects.commits.vec_largest_by_size_bytes);
930+
survey_report_largest_vec(ctx->report.reachable_objects.trees.vec_largest_by_nr_entries);
931+
survey_report_largest_vec(ctx->report.reachable_objects.trees.vec_largest_by_size_bytes);
932+
survey_report_largest_vec(ctx->report.reachable_objects.blobs.vec_largest_by_size_bytes);
790933
}
791934

792935
/*
@@ -858,6 +1001,27 @@ static int survey_load_config_cb(const char *var, const char *value,
8581001
ctx->opts.show_progress = git_config_bool(var, value);
8591002
return 0;
8601003
}
1004+
if (!strcmp(var, "survey.showcommitparents")) {
1005+
ctx->opts.show_largest_commits_by_nr_parents = git_config_ulong(var, value, cctx->kvi);
1006+
return 0;
1007+
}
1008+
if (!strcmp(var, "survey.showcommitsizes")) {
1009+
ctx->opts.show_largest_commits_by_size_bytes = git_config_ulong(var, value, cctx->kvi);
1010+
return 0;
1011+
}
1012+
1013+
if (!strcmp(var, "survey.showtreeentries")) {
1014+
ctx->opts.show_largest_trees_by_nr_entries = git_config_ulong(var, value, cctx->kvi);
1015+
return 0;
1016+
}
1017+
if (!strcmp(var, "survey.showtreesizes")) {
1018+
ctx->opts.show_largest_trees_by_size_bytes = git_config_ulong(var, value, cctx->kvi);
1019+
return 0;
1020+
}
1021+
if (!strcmp(var, "survey.showblobsizes")) {
1022+
ctx->opts.show_largest_blobs_by_size_bytes = git_config_ulong(var, value, cctx->kvi);
1023+
return 0;
1024+
}
8611025
if (!strcmp(var, "survey.top")) {
8621026
ctx->opts.top_nr = git_config_bool(var, value);
8631027
return 0;
@@ -1068,6 +1232,9 @@ static void increment_totals(struct survey_context *ctx,
10681232

10691233
ctx->report.reachable_objects.commits.parent_cnt_pbin[k]++;
10701234
base = &ctx->report.reachable_objects.commits.base;
1235+
1236+
maybe_insert_large_item(ctx->report.reachable_objects.commits.vec_largest_by_nr_parents, k, &commit->object.oid);
1237+
maybe_insert_large_item(ctx->report.reachable_objects.commits.vec_largest_by_size_bytes, object_length, &commit->object.oid);
10711238
break;
10721239
}
10731240
case OBJ_TREE: {
@@ -1087,8 +1254,8 @@ static void increment_totals(struct survey_context *ctx,
10871254

10881255
pst->sum_entries += nr_entries;
10891256

1090-
if (nr_entries > pst->max_entries)
1091-
pst->max_entries = nr_entries;
1257+
maybe_insert_large_item(pst->vec_largest_by_nr_entries, nr_entries, &tree->object.oid);
1258+
maybe_insert_large_item(pst->vec_largest_by_size_bytes, object_length, &tree->object.oid);
10921259

10931260
qb = qbin(nr_entries);
10941261
incr_obj_hist_bin(&pst->entry_qbin[qb], object_length, disk_sizep);
@@ -1098,6 +1265,8 @@ static void increment_totals(struct survey_context *ctx,
10981265
}
10991266
case OBJ_BLOB:
11001267
base = &ctx->report.reachable_objects.blobs.base;
1268+
1269+
maybe_insert_large_item(ctx->report.reachable_objects.blobs.vec_largest_by_size_bytes, object_length, &oids->oid[i]);
11011270
break;
11021271
default:
11031272
continue;
@@ -1304,6 +1473,14 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor
13041473
OPT_BOOL_F(0, "detached", &ctx.opts.refs.want_detached, N_("include detached HEAD"), PARSE_OPT_NONEG),
13051474
OPT_BOOL_F(0, "other", &ctx.opts.refs.want_other, N_("include notes and stashes"), PARSE_OPT_NONEG),
13061475

1476+
OPT_INTEGER_F(0, "commit-parents", &ctx.opts.show_largest_commits_by_nr_parents, N_("show N largest commits by parent count"), PARSE_OPT_NONEG),
1477+
OPT_INTEGER_F(0, "commit-sizes", &ctx.opts.show_largest_commits_by_size_bytes, N_("show N largest commits by size in bytes"), PARSE_OPT_NONEG),
1478+
1479+
OPT_INTEGER_F(0, "tree-entries", &ctx.opts.show_largest_trees_by_nr_entries, N_("show N largest trees by entry count"), PARSE_OPT_NONEG),
1480+
OPT_INTEGER_F(0, "tree-sizes", &ctx.opts.show_largest_trees_by_size_bytes, N_("show N largest trees by size in bytes"), PARSE_OPT_NONEG),
1481+
1482+
OPT_INTEGER_F(0, "blob-sizes", &ctx.opts.show_largest_blobs_by_size_bytes, N_("show N largest blobs by size in bytes"), PARSE_OPT_NONEG),
1483+
13071484
OPT_END(),
13081485
};
13091486

@@ -1327,6 +1504,39 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor
13271504

13281505
fixup_refs_wanted(&ctx);
13291506

1507+
if (ctx.opts.show_largest_commits_by_nr_parents)
1508+
ctx.report.reachable_objects.commits.vec_largest_by_nr_parents =
1509+
alloc_large_item_vec(
1510+
"largest_commits_by_nr_parents",
1511+
"nr_parents",
1512+
ctx.opts.show_largest_commits_by_nr_parents);
1513+
if (ctx.opts.show_largest_commits_by_size_bytes)
1514+
ctx.report.reachable_objects.commits.vec_largest_by_size_bytes =
1515+
alloc_large_item_vec(
1516+
"largest_commits_by_size_bytes",
1517+
"size",
1518+
ctx.opts.show_largest_commits_by_size_bytes);
1519+
1520+
if (ctx.opts.show_largest_trees_by_nr_entries)
1521+
ctx.report.reachable_objects.trees.vec_largest_by_nr_entries =
1522+
alloc_large_item_vec(
1523+
"largest_trees_by_nr_entries",
1524+
"nr_entries",
1525+
ctx.opts.show_largest_trees_by_nr_entries);
1526+
if (ctx.opts.show_largest_trees_by_size_bytes)
1527+
ctx.report.reachable_objects.trees.vec_largest_by_size_bytes =
1528+
alloc_large_item_vec(
1529+
"largest_trees_by_size_bytes",
1530+
"size",
1531+
ctx.opts.show_largest_trees_by_size_bytes);
1532+
1533+
if (ctx.opts.show_largest_blobs_by_size_bytes)
1534+
ctx.report.reachable_objects.blobs.vec_largest_by_size_bytes =
1535+
alloc_large_item_vec(
1536+
"largest_blobs_by_size_bytes",
1537+
"size",
1538+
ctx.opts.show_largest_blobs_by_size_bytes);
1539+
13301540
survey_phase_refs(&ctx);
13311541

13321542
survey_phase_objects(&ctx);

0 commit comments

Comments
 (0)