Skip to content
This repository was archived by the owner on Sep 30, 2024. It is now read-only.

Commit f61e637

Browse files
authored
feat(code insights): language stats speed improvements by using archive loading (#62946)
We previously improved the performance of Language Stats Insights by introducing parallel requests to gitserver: https://github.com/sourcegraph/sourcegraph/pull/62011 This PR replaces the previous approach where we would iterate through and request each file from gitserver with an approach where we request just one archive. This eliminates a lot of network traffic, and gives us an additional(!) performance improvement of 70-90%. Even repositories like chromium (42GB) can now be processed (on my machine in just one minute). --- Caching: We dropped most of the caching, and kept only the top-level caching (repo@commit). This means that we only need to compute the language stats once per commit, and subsequent users/requests can see the cached data. We dropped the file/directory level caching, because (1) the code to do that got very complex and (2) we can assume that most repositories are able to compute within the 5 minutes timeout (which can be increase via the environment variable `GET_INVENTORY_TIMEOUT`). The timeout is not bound to the user's request anymore. Before, the frontend would request the stats up to three times to let the computation move forward and pick up where the last request aborted. While we still have this frontend retry mechanism, we don't have to worry about an abort-and-continue mechanism in the backend. --- Credits for the code to @eseliger: https://github.com/sourcegraph/sourcegraph/issues/62019#issuecomment-2119278481 I've taken the diff, and updated the caching methods to allow for more advanced use cases should we decide to introduce more caching. We can take that out again if the current caching is sufficient. Todos: - [x] Check if CI passes, manual testing seems to be fine - [x] Verify that insights are cached at the top level --- Test data: - sourcegraph/sourcegraph: 9.07s (main) -> 1.44s (current): 74% better - facebook/react: 17.52s (main) -> 0.87s (current): 95% better - godotengine/godot: 28.92s (main) -> 1.98s (current): 93% better - chromium/chromium: ~1 minute: 100% better, because it didn't compute before ## Changelog - Language stats queries now request one archive from gitserver instead of individual file requests. This leads to a huge performance improvement. Even extra large repositories like chromium are now able to compute within one minute. Previously they timed out. ## Test plan - New unit tests - Plenty of manual testing
1 parent c38e0b1 commit f61e637

File tree

11 files changed

+629
-183
lines changed

11 files changed

+629
-183
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ All notable changes to Sourcegraph are documented in this file.
3737

3838
- The default and recommended chat model for Anthropic and Cody Gateway configurations is now `claude-3-sonnet-20240229`. [#62757](https://github.com/sourcegraph/sourcegraph/pull/62757)
3939
- The default and recommended autocomplete model for Cody Gateway configurations is now `fireworks/starcoder`. [#62757](https://github.com/sourcegraph/sourcegraph/pull/62757)
40+
- Code Insights: Language Stats Insights performance improved by another 70-90%. It's now able to handle repositories above 40 GB. [#62946](https://github.com/sourcegraph/sourcegraph/pull/62946)
4041
- The keyword search toggle has been removed from the search results page. [Keyword search](https://sourcegraph.com/docs/code-search/queries#keyword-search-default) is now enabled by default for all searches in the Sourcegraph web app. [#63584](https://github.com/sourcegraph/sourcegraph/pull/63584)
4142

4243
### Fixed

cmd/frontend/backend/inventory.go

Lines changed: 17 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -52,20 +52,16 @@ func InventoryContext(logger log.Logger, repo api.RepoName, gsClient gitserver.C
5252
return inventory.Context{}, errors.Errorf("refusing to compute inventory for non-absolute commit ID %q", commitID)
5353
}
5454

55-
cacheKey := func(e fs.FileInfo) string {
56-
info, ok := e.Sys().(gitdomain.ObjectInfo)
57-
if !ok {
58-
return "" // not cacheable
59-
}
60-
return info.OID().String()
61-
}
62-
6355
gitServerSemaphore := semaphore.NewWeighted(int64(gitServerConcurrency))
6456
redisSemaphore := semaphore.NewWeighted(int64(redisConcurrency))
6557

6658
logger = logger.Scoped("InventoryContext").
6759
With(log.String("repo", string(repo)), log.String("commitID", string(commitID)))
6860
invCtx := inventory.Context{
61+
Repo: repo,
62+
CommitID: commitID,
63+
ShouldSkipEnhancedLanguageDetection: !useEnhancedLanguageDetection && !forceEnhancedLanguageDetection,
64+
GitServerClient: gsClient,
6965
ReadTree: func(ctx context.Context, path string) ([]fs.FileInfo, error) {
7066
trc, ctx := trace.New(ctx, "ReadTree waits for semaphore")
7167
err := gitServerSemaphore.Acquire(ctx, 1)
@@ -109,55 +105,52 @@ func InventoryContext(logger log.Logger, repo api.RepoName, gsClient gitserver.C
109105
gitServerSemaphore.Release(1)
110106
}}, nil
111107
},
112-
CacheGet: func(ctx context.Context, e fs.FileInfo) (inventory.Inventory, bool) {
113-
cacheKey := cacheKey(e)
108+
CacheKey: func(e fs.FileInfo) string {
109+
info, ok := e.Sys().(gitdomain.ObjectInfo)
110+
if !ok {
111+
return "" // not cacheable
112+
}
113+
return info.OID().String()
114+
},
115+
CacheGet: func(ctx context.Context, cacheKey string) (inventory.Inventory, bool) {
114116
if cacheKey == "" {
115117
return inventory.Inventory{}, false // not cacheable
116118
}
117119

118120
if err := redisSemaphore.Acquire(ctx, 1); err != nil {
119-
logger.Warn("Failed to acquire semaphore for redis cache.", log.String("path", e.Name()), log.Error(err))
121+
logger.Warn("Failed to acquire semaphore for redis cache.", log.String("cacheKey", cacheKey), log.Error(err))
120122
return inventory.Inventory{}, false
121123
}
122124
defer redisSemaphore.Release(1)
123125

124126
if b, ok := inventoryCache.Get(cacheKey); ok {
125127
var inv inventory.Inventory
126128
if err := json.Unmarshal(b, &inv); err != nil {
127-
logger.Warn("Failed to unmarshal cached JSON inventory.", log.String("path", e.Name()), log.Error(err))
129+
logger.Warn("Failed to unmarshal cached JSON inventory.", log.String("cacheKey", cacheKey), log.Error(err))
128130
return inventory.Inventory{}, false
129131
}
130132
return inv, true
131133
}
132134
return inventory.Inventory{}, false
133135
},
134-
CacheSet: func(ctx context.Context, e fs.FileInfo, inv inventory.Inventory) {
135-
cacheKey := cacheKey(e)
136+
CacheSet: func(ctx context.Context, cacheKey string, inv inventory.Inventory) {
136137
if cacheKey == "" {
137138
return // not cacheable
138139
}
139140
b, err := json.Marshal(&inv)
140141
if err != nil {
141-
logger.Warn("Failed to marshal JSON inventory for cache.", log.String("path", e.Name()), log.Error(err))
142+
logger.Warn("Failed to marshal JSON inventory for cache.", log.String("cacheKey", cacheKey), log.Error(err))
142143
return
143144
}
144145

145146
if err := redisSemaphore.Acquire(ctx, 1); err != nil {
146-
logger.Warn("Failed to acquire semaphore for redis cache.", log.String("path", e.Name()), log.Error(err))
147+
logger.Warn("Failed to acquire semaphore for redis cache.", log.String("cacheKey", cacheKey), log.Error(err))
147148
return
148149
}
149150
defer redisSemaphore.Release(1)
150151
inventoryCache.Set(cacheKey, b)
151152
},
152153
}
153154

154-
if !useEnhancedLanguageDetection && !forceEnhancedLanguageDetection {
155-
// If USE_ENHANCED_LANGUAGE_DETECTION is disabled, do not read file contents to determine
156-
// the language. This means we won't calculate the number of lines per language.
157-
invCtx.NewFileReader = func(ctx context.Context, path string) (io.ReadCloser, error) {
158-
return nil, nil
159-
}
160-
}
161-
162155
return invCtx, nil
163156
}

cmd/frontend/backend/repos.go

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,7 @@ package backend
33
import (
44
"context"
55
"fmt"
6-
"strconv"
7-
"time"
8-
9-
"github.com/sourcegraph/sourcegraph/internal/env"
10-
116
"github.com/sourcegraph/log"
12-
"go.opentelemetry.io/otel/attribute"
13-
147
"github.com/sourcegraph/sourcegraph/cmd/frontend/internal/inventory"
158
"github.com/sourcegraph/sourcegraph/internal/api"
169
"github.com/sourcegraph/sourcegraph/internal/database"
@@ -19,6 +12,7 @@ import (
1912
"github.com/sourcegraph/sourcegraph/internal/trace"
2013
"github.com/sourcegraph/sourcegraph/internal/types"
2114
"github.com/sourcegraph/sourcegraph/lib/errors"
15+
"go.opentelemetry.io/otel/attribute"
2216
)
2317

2418
type ReposService interface {
@@ -112,8 +106,6 @@ func (s *repos) ListIndexable(ctx context.Context) (repos []types.MinimalRepo, e
112106
})
113107
}
114108

115-
var getInventoryTimeout, _ = strconv.Atoi(env.Get("GET_INVENTORY_TIMEOUT", "5", "Time in minutes before cancelling getInventory requests. Raise this if your repositories are large and need a long time to process."))
116-
117109
func (s *repos) GetInventory(ctx context.Context, repo api.RepoName, commitID api.CommitID, forceEnhancedLanguageDetection bool) (res *inventory.Inventory, err error) {
118110
if Mocks.Repos.GetInventory != nil {
119111
return Mocks.Repos.GetInventory(ctx, repo, commitID)
@@ -122,24 +114,12 @@ func (s *repos) GetInventory(ctx context.Context, repo api.RepoName, commitID ap
122114
ctx, done := startTrace(ctx, "GetInventory", map[string]any{"repo": repo, "commitID": commitID}, &err)
123115
defer done()
124116

125-
ctx, cancel := context.WithTimeout(ctx, time.Duration(getInventoryTimeout)*time.Minute)
126-
defer cancel()
127-
128117
invCtx, err := InventoryContext(s.logger, repo, s.gitserverClient, commitID, forceEnhancedLanguageDetection)
129118
if err != nil {
130119
return nil, err
131120
}
132121

133-
root, err := s.gitserverClient.Stat(ctx, repo, commitID, "")
134-
if err != nil {
135-
return nil, err
136-
}
137-
138-
// In computing the inventory, sub-tree inventories are cached based on the OID of the Git
139-
// tree. Compared to per-blob caching, this creates many fewer cache entries, which means fewer
140-
// stores, fewer lookups, and less cache storage overhead. Compared to per-commit caching, this
141-
// yields a higher cache hit rate because most trees are unchanged across commits.
142-
inv, err := invCtx.Entries(ctx, root)
122+
inv, err := invCtx.All(ctx)
143123
if err != nil {
144124
return nil, err
145125
}

cmd/frontend/backend/repos_test.go

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package backend
22

33
import (
4+
"archive/tar"
45
"bytes"
56
"context"
67
"flag"
@@ -69,6 +70,43 @@ func (oid gitObjectInfo) OID() gitdomain.OID {
6970
return v
7071
}
7172

73+
type FileData struct {
74+
Name string
75+
Content string
76+
}
77+
78+
// createInMemoryTarArchive creates a tar archive in memory containing multiple files with their given content.
79+
func createInMemoryTarArchive(files []FileData) ([]byte, error) {
80+
buf := new(bytes.Buffer)
81+
tarWriter := tar.NewWriter(buf)
82+
83+
for _, file := range files {
84+
header := &tar.Header{
85+
Name: file.Name,
86+
Size: int64(len(file.Content)),
87+
}
88+
89+
err := tarWriter.WriteHeader(header)
90+
if err != nil {
91+
return nil, err
92+
}
93+
94+
// Write the content to the tar archive.
95+
_, err = io.WriteString(tarWriter, file.Content)
96+
if err != nil {
97+
return nil, err
98+
}
99+
}
100+
101+
// Close the tar writer to flush the data to the buffer.
102+
err := tarWriter.Close()
103+
if err != nil {
104+
return nil, err
105+
}
106+
107+
return buf.Bytes(), nil
108+
}
109+
72110
func TestReposGetInventory(t *testing.T) {
73111
ctx := testContext()
74112

@@ -108,13 +146,27 @@ func TestReposGetInventory(t *testing.T) {
108146
switch name {
109147
case "b.go":
110148
data = []byte("package main")
111-
case "a/c.m":
149+
case "c.m":
112150
data = []byte("@interface X:NSObject {}")
113151
default:
114152
panic("unhandled mock ReadFile " + name)
115153
}
116154
return io.NopCloser(bytes.NewReader(data)), nil
117155
})
156+
gitserverClient.ArchiveReaderFunc.SetDefaultHook(func(_ context.Context, _ api.RepoName, archiveOptions gitserver.ArchiveOptions) (io.ReadCloser, error) {
157+
files := []FileData{
158+
{Name: "b.go", Content: "package main"},
159+
{Name: "a/c.m", Content: "@interface X:NSObject {}"},
160+
}
161+
162+
// Create the in-memory tar archive.
163+
archiveData, err := createInMemoryTarArchive(files)
164+
if err != nil {
165+
t.Fatalf("Failed to create in-memory tar archive: %v", err)
166+
}
167+
168+
return io.NopCloser(bytes.NewReader(archiveData)), nil
169+
})
118170
s := repos{
119171
logger: logtest.Scoped(t),
120172
gitserverClient: gitserverClient,

cmd/frontend/graphqlbackend/search_results_stats_languages_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package graphqlbackend
22

33
import (
4+
"archive/tar"
45
"bytes"
56
"context"
67
"io"
@@ -22,6 +23,43 @@ import (
2223
"github.com/sourcegraph/sourcegraph/internal/types"
2324
)
2425

26+
type FileData struct {
27+
Name string
28+
Content string
29+
}
30+
31+
// createInMemoryTarArchive creates a tar archive in memory containing multiple files with their given content.
32+
func createInMemoryTarArchive(files []FileData) ([]byte, error) {
33+
buf := new(bytes.Buffer)
34+
tarWriter := tar.NewWriter(buf)
35+
36+
for _, file := range files {
37+
header := &tar.Header{
38+
Name: file.Name,
39+
Size: int64(len(file.Content)),
40+
}
41+
42+
err := tarWriter.WriteHeader(header)
43+
if err != nil {
44+
return nil, err
45+
}
46+
47+
// Write the content to the tar archive.
48+
_, err = io.WriteString(tarWriter, file.Content)
49+
if err != nil {
50+
return nil, err
51+
}
52+
}
53+
54+
// Close the tar writer to flush the data to the buffer.
55+
err := tarWriter.Close()
56+
if err != nil {
57+
return nil, err
58+
}
59+
60+
return buf.Bytes(), nil
61+
}
62+
2563
func TestSearchResultsStatsLanguages(t *testing.T) {
2664
logger := logtest.Scoped(t)
2765
wantCommitID := api.CommitID(strings.Repeat("a", 40))
@@ -58,6 +96,20 @@ func TestSearchResultsStatsLanguages(t *testing.T) {
5896
gsClient.StatFunc.SetDefaultHook(func(_ context.Context, _ api.RepoName, _ api.CommitID, path string) (fs.FileInfo, error) {
5997
return &fileutil.FileInfo{Name_: path, Mode_: os.ModeDir}, nil
6098
})
99+
gsClient.ArchiveReaderFunc.SetDefaultHook(func(_ context.Context, _ api.RepoName, archiveOptions gitserver.ArchiveOptions) (io.ReadCloser, error) {
100+
files := []FileData{
101+
{Name: "two.go", Content: "a\nb\n"},
102+
{Name: "three.go", Content: "a\nb\nc\n"},
103+
}
104+
105+
// Create the in-memory tar archive.
106+
archiveData, err := createInMemoryTarArchive(files)
107+
if err != nil {
108+
t.Fatalf("Failed to create in-memory tar archive: %v", err)
109+
}
110+
111+
return io.NopCloser(bytes.NewReader(archiveData)), nil
112+
})
61113

62114
mkResult := func(path string, lineNumbers ...int) *result.FileMatch {
63115
rn := types.MinimalRepo{

cmd/frontend/internal/inventory/BUILD.bazel

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,15 @@ go_library(
1212
tags = [TAG_SEARCHSUITE],
1313
visibility = ["//cmd/frontend:__subpackages__"],
1414
deps = [
15-
"//internal/trace",
15+
"//internal/api",
16+
"//internal/env",
17+
"//internal/gitserver",
1618
"//lib/codeintel/languages",
1719
"//lib/errors",
1820
"@com_github_go_enry_go_enry_v2//:go-enry",
1921
"@com_github_go_enry_go_enry_v2//data",
2022
"@com_github_sourcegraph_conc//iter",
2123
"@com_github_sourcegraph_log//:log",
22-
"@io_opentelemetry_go_otel//attribute",
2324
],
2425
)
2526

@@ -32,9 +33,7 @@ go_test(
3233
embed = [":inventory"],
3334
tags = [TAG_SEARCHSUITE],
3435
deps = [
35-
"//internal/fileutil",
3636
"//lib/errors",
3737
"@com_github_go_enry_go_enry_v2//:go-enry",
38-
"@com_github_google_go_cmp//cmp",
3938
],
4039
)

cmd/frontend/internal/inventory/context.go

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,23 @@ package inventory
22

33
import (
44
"context"
5+
"github.com/sourcegraph/sourcegraph/internal/gitserver"
56
"io"
67
"io/fs"
8+
9+
"github.com/sourcegraph/sourcegraph/internal/api"
710
)
811

912
// Context defines the environment in which the inventory is computed.
1013
type Context struct {
14+
Repo api.RepoName
15+
16+
CommitID api.CommitID
17+
18+
ShouldSkipEnhancedLanguageDetection bool
19+
20+
GitServerClient gitserver.Client
21+
1122
// ReadTree is called to list the immediate children of a tree at path. The returned fs.FileInfo
1223
// values' Name method must return the full path (that can be passed to another ReadTree or
1324
// ReadFile call), not just the basename.
@@ -16,9 +27,11 @@ type Context struct {
1627
// NewFileReader is called to get an io.ReadCloser from the file at path.
1728
NewFileReader func(ctx context.Context, path string) (io.ReadCloser, error)
1829

30+
CacheKey func(e fs.FileInfo) string
31+
1932
// CacheGet, if set, returns the cached inventory and true for the given tree, or false for a cache miss.
20-
CacheGet func(context.Context, fs.FileInfo) (Inventory, bool)
33+
CacheGet func(context.Context, string) (Inventory, bool)
2134

2235
// CacheSet, if set, stores the inventory in the cache for the given tree.
23-
CacheSet func(context.Context, fs.FileInfo, Inventory)
36+
CacheSet func(context.Context, string, Inventory)
2437
}

0 commit comments

Comments
 (0)