@@ -1211,20 +1211,87 @@ sub do_fetch {
1211
1211
sub mkemptydirs {
1212
1212
my ($self , $r ) = @_ ;
1213
1213
1214
+ # add/remove/collect a paths table
1215
+ #
1216
+ # Paths are split into a tree of nodes, stored as a hash of hashes.
1217
+ #
1218
+ # Each node contains a 'path' entry for the path (if any) associated
1219
+ # with that node and a 'children' entry for any nodes under that
1220
+ # location.
1221
+ #
1222
+ # Removing a path requires a hash lookup for each component then
1223
+ # dropping that node (and anything under it), which is substantially
1224
+ # faster than a grep slice into a single hash of paths for large
1225
+ # numbers of paths.
1226
+ #
1227
+ # For a large (200K) number of empty_dir directives this reduces
1228
+ # scanning time to 3 seconds vs 10 minutes for grep+delete on a single
1229
+ # hash of paths.
1230
+ sub add_path {
1231
+ my ($paths_table , $path ) = @_ ;
1232
+ my $node_ref ;
1233
+
1234
+ foreach my $x (split (' /' , $path )) {
1235
+ if (!exists ($paths_table -> {$x })) {
1236
+ $paths_table -> {$x } = { children => {} };
1237
+ }
1238
+
1239
+ $node_ref = $paths_table -> {$x };
1240
+ $paths_table = $paths_table -> {$x }-> {children };
1241
+ }
1242
+
1243
+ $node_ref -> {path } = $path ;
1244
+ }
1245
+
1246
+ sub remove_path {
1247
+ my ($paths_table , $path ) = @_ ;
1248
+ my $nodes_ref ;
1249
+ my $node_name ;
1250
+
1251
+ foreach my $x (split (' /' , $path )) {
1252
+ if (!exists ($paths_table -> {$x })) {
1253
+ return ;
1254
+ }
1255
+
1256
+ $nodes_ref = $paths_table ;
1257
+ $node_name = $x ;
1258
+
1259
+ $paths_table = $paths_table -> {$x }-> {children };
1260
+ }
1261
+
1262
+ delete ($nodes_ref -> {$node_name });
1263
+ }
1264
+
1265
+ sub collect_paths {
1266
+ my ($paths_table , $paths_ref ) = @_ ;
1267
+
1268
+ foreach my $v (values %$paths_table ) {
1269
+ my $p = $v -> {path };
1270
+ my $c = $v -> {children };
1271
+
1272
+ collect_paths($c , $paths_ref );
1273
+
1274
+ if (defined ($p )) {
1275
+ push (@$paths_ref , $p );
1276
+ }
1277
+ }
1278
+ }
1279
+
1214
1280
sub scan {
1215
- my ($r , $empty_dirs , $line ) = @_ ;
1281
+ my ($r , $paths_table , $line ) = @_ ;
1216
1282
if (defined $r && $line =~ / ^r(\d +)$ / ) {
1217
1283
return 0 if $1 > $r ;
1218
1284
} elsif ($line =~ / ^ \+ empty_dir: (.+)$ / ) {
1219
- $empty_dirs -> { $1 } = 1 ;
1285
+ add_path( $paths_table , $1 ) ;
1220
1286
} elsif ($line =~ / ^ \- empty_dir: (.+)$ / ) {
1221
- my @d = grep {m [^\Q$1 \E(/ |$) ]} (keys %$empty_dirs );
1222
- delete @$empty_dirs {@d };
1287
+ remove_path($paths_table , $1 );
1223
1288
}
1224
1289
1; # continue
1225
1290
};
1226
1291
1227
- my %empty_dirs = ();
1292
+ my @empty_dirs ;
1293
+ my %paths_table ;
1294
+
1228
1295
my $gz_file = " $self ->{dir}/unhandled.log.gz" ;
1229
1296
if (-f $gz_file ) {
1230
1297
if (!can_compress()) {
@@ -1235,7 +1302,7 @@ sub mkemptydirs {
1235
1302
die " Unable to open $gz_file : $! \n " ;
1236
1303
my $line ;
1237
1304
while ($gz -> gzreadline($line ) > 0) {
1238
- scan($r , \% empty_dirs , $line ) or last;
1305
+ scan($r , \%paths_table , $line ) or last ;
1239
1306
}
1240
1307
$gz -> gzclose;
1241
1308
}
@@ -1244,13 +1311,14 @@ sub mkemptydirs {
1244
1311
if (open my $fh , ' <' , " $self ->{dir}/unhandled.log" ) {
1245
1312
binmode $fh or croak " binmode: $! " ;
1246
1313
while (<$fh >) {
1247
- scan($r , \% empty_dirs , $_ ) or last;
1314
+ scan($r , \%paths_table , $_ ) or last ;
1248
1315
}
1249
1316
close $fh ;
1250
1317
}
1251
1318
1319
+ collect_paths(\%paths_table , \@empty_dirs );
1252
1320
my $strip = qr /\A\Q @{[$self ->path]}\E (?:\/ |$ )/ ;
1253
- foreach my $d (sort keys % empty_dirs ) {
1321
+ foreach my $d (sort @ empty_dirs ) {
1254
1322
$d = uri_decode($d );
1255
1323
$d =~ s / $strip// ;
1256
1324
next unless length ($d );
0 commit comments