Skip to content
This repository was archived by the owner on Sep 30, 2024. It is now read-only.

feat: code insights language stats performance improvements #62011

Merged
merged 28 commits into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
b23c74f
feat: raise timeout and introduce parallel gitserver calls
bahrmichael Apr 18, 2024
8566ffe
chore: rework semaphore into inventory client + separate redis and gi…
bahrmichael Apr 19, 2024
a132c83
Merge branch 'main' into bahrmichael/2962
bahrmichael Apr 19, 2024
a5ac551
Merge branch 'main' into bahrmichael/2962
bahrmichael Apr 22, 2024
1ac9cd5
feat: introduce new properties to tweak GetInventory performance; new…
bahrmichael Apr 22, 2024
3b159fb
chore: add and fix tests
bahrmichael Apr 22, 2024
3034652
chore: revert rcache version change
bahrmichael Apr 22, 2024
bfd4cdc
chore: omit err check
bahrmichael Apr 22, 2024
2efa3cf
chore: simplify closure
bahrmichael Apr 22, 2024
e64a7a4
fix: update bazel files
bahrmichael Apr 22, 2024
3beafa6
fix: linting and use grafana regexp
bahrmichael Apr 22, 2024
d7d34bb
fix: default values for config; use conc package
bahrmichael Apr 22, 2024
e1c639c
fix: apply correct locking and tracing
bahrmichael Apr 23, 2024
4a9c357
chore: replace experimental config with env vars (still needs build f…
bahrmichael Apr 23, 2024
e3ef818
fix: enforce DFS to better utilize tree caching
bahrmichael Apr 23, 2024
4807b1e
fix: run bazel configure
bahrmichael Apr 23, 2024
5eedbca
fix: bazel configure and use correct var
bahrmichael Apr 23, 2024
92363c7
fix: remove DFS and fix nogo errors
bahrmichael Apr 23, 2024
373ebb3
fix: increase redis concurrency
bahrmichael Apr 23, 2024
f01d577
chore: trace cleanup and logging
bahrmichael Apr 24, 2024
c7ae721
Merge branch 'main' into bahrmichael/2962
bahrmichael Apr 24, 2024
d6caa17
chore: add changelog entry
bahrmichael Apr 24, 2024
768babf
chore: update bazel files
bahrmichael Apr 24, 2024
cefdaf5
chore: update bazel files
bahrmichael Apr 24, 2024
ab08d6f
chore: harden test for concurrency
bahrmichael Apr 24, 2024
fe7b523
fix: allow data race in entries_test.go
bahrmichael Apr 24, 2024
6671ee4
fix: allow data race in entries_test.go
bahrmichael Apr 24, 2024
b7ee4a3
chore: update comment on why we don't do a recursive call
bahrmichael Apr 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ All notable changes to Sourcegraph are documented in this file.

- Improved syntax highlighting for Dart. [#58480](https://github.com/sourcegraph/sourcegraph/pull/58480)
- The default noop Event type in the honey package has been replaced with a new type that aggregates fields in memory for testing and logging purposes. [#61854](https://github.com/sourcegraph/sourcegraph/pull/61854)
- Improved the performance of Language Stats Insights by 50-70% by increasing the concurrent requests from the frontend to the gitserver from 1 to 4. You can override the concurrency with the `GET_INVENTORY_GIT_SERVER_CONCURRENCY` environment variable. [#62011](https://github.com/sourcegraph/sourcegraph/pull/62011)
- Raised the backend timeout for Language Stats Insights from 3 minutes to 5 minutes. You can override this with the `GET_INVENTORY_TIMEOUT` environment variable. [#62011](https://github.com/sourcegraph/sourcegraph/pull/62011)

### Fixed

Expand Down
1 change: 1 addition & 0 deletions cmd/frontend/backend/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ go_library(
"@com_github_prometheus_client_golang//prometheus/promauto",
"@com_github_sourcegraph_log//:log",
"@io_opentelemetry_go_otel//attribute",
"@org_golang_x_sync//semaphore",
],
)

Expand Down
62 changes: 57 additions & 5 deletions cmd/frontend/backend/inventory.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import (
"context"
"encoding/json"
"fmt"
"github.com/sourcegraph/sourcegraph/internal/trace"
"golang.org/x/sync/semaphore"
"io"
"io/fs"
"strconv"
Expand All @@ -25,6 +27,22 @@ var useEnhancedLanguageDetection, _ = strconv.ParseBool(env.Get("USE_ENHANCED_LA

var inventoryCache = rcache.New(fmt.Sprintf("inv:v2:enhanced_%v", useEnhancedLanguageDetection))

var gitServerConcurrency, _ = strconv.Atoi(env.Get("GET_INVENTORY_GIT_SERVER_CONCURRENCY", "4", "Changes the number of concurrent requests against the gitserver for getInventory requests."))

// Raising this value to 50 or higher lead to the following error on my dev machine
// lvl=warn msg="failed to execute redis command" cmd=GET error="dial tcp *********:6379: connect: can't assign requested address"
var redisConcurrency, _ = strconv.Atoi(env.Get("GET_INVENTORY_REDIS_CONCURRENCY", "20", "Changes the number of concurrent requests against the redis cache for getInventory requests."))

type semaphoredReadCloser struct {
io.ReadCloser
releaseSemaphore func()
}

func (s *semaphoredReadCloser) Close() error {
defer s.releaseSemaphore()
return s.ReadCloser.Close()
}

// InventoryContext returns the inventory context for computing the inventory for the repository at
// the given commit.
func InventoryContext(logger log.Logger, repo api.RepoName, gsClient gitserver.Client, commitID api.CommitID, forceEnhancedLanguageDetection bool) (inventory.Context, error) {
Expand All @@ -40,22 +58,50 @@ func InventoryContext(logger log.Logger, repo api.RepoName, gsClient gitserver.C
return info.OID().String()
}

gitServerSemaphore := semaphore.NewWeighted(int64(gitServerConcurrency))
redisSemaphore := semaphore.NewWeighted(int64(redisConcurrency))

logger = logger.Scoped("InventoryContext").
With(log.String("repo", string(repo)), log.String("commitID", string(commitID)))
invCtx := inventory.Context{
ReadTree: func(ctx context.Context, path string) ([]fs.FileInfo, error) {
// TODO: As a perf optimization, we could read multiple levels of the Git tree at once
// to avoid sequential tree traversal calls.
trc, ctx := trace.New(ctx, "ReadTree waits for semaphore")
err := gitServerSemaphore.Acquire(ctx, 1)
trc.End()
if err != nil {
return nil, err
}
defer gitServerSemaphore.Release(1)
// Using recurse=true does not yield a significant performance improvement. See https://github.com/sourcegraph/sourcegraph/pull/62011/files#r1577513913.
return gsClient.ReadDir(ctx, repo, commitID, path, false)
},
NewFileReader: func(ctx context.Context, path string) (io.ReadCloser, error) {
return gsClient.NewFileReader(ctx, repo, commitID, path)
trc, ctx := trace.New(ctx, "NewFileReader waits for semaphore")
err := gitServerSemaphore.Acquire(ctx, 1)
trc.End()
if err != nil {
return nil, err
}
reader, err := gsClient.NewFileReader(ctx, repo, commitID, path)
if err != nil {
return nil, err
}
return &semaphoredReadCloser{ReadCloser: reader, releaseSemaphore: func() {
gitServerSemaphore.Release(1)
}}, nil
},
CacheGet: func(e fs.FileInfo) (inventory.Inventory, bool) {
CacheGet: func(ctx context.Context, e fs.FileInfo) (inventory.Inventory, bool) {
cacheKey := cacheKey(e)
if cacheKey == "" {
return inventory.Inventory{}, false // not cacheable
}

if err := redisSemaphore.Acquire(ctx, 1); err != nil {
logger.Warn("Failed to acquire semaphore for redis cache.", log.String("path", e.Name()), log.Error(err))
return inventory.Inventory{}, false
}
defer redisSemaphore.Release(1)

if b, ok := inventoryCache.Get(cacheKey); ok {
var inv inventory.Inventory
if err := json.Unmarshal(b, &inv); err != nil {
Expand All @@ -66,7 +112,7 @@ func InventoryContext(logger log.Logger, repo api.RepoName, gsClient gitserver.C
}
return inventory.Inventory{}, false
},
CacheSet: func(e fs.FileInfo, inv inventory.Inventory) {
CacheSet: func(ctx context.Context, e fs.FileInfo, inv inventory.Inventory) {
cacheKey := cacheKey(e)
if cacheKey == "" {
return // not cacheable
Expand All @@ -76,6 +122,12 @@ func InventoryContext(logger log.Logger, repo api.RepoName, gsClient gitserver.C
logger.Warn("Failed to marshal JSON inventory for cache.", log.String("path", e.Name()), log.Error(err))
return
}

if err := redisSemaphore.Acquire(ctx, 1); err != nil {
logger.Warn("Failed to acquire semaphore for redis cache.", log.String("path", e.Name()), log.Error(err))
return
}
defer redisSemaphore.Release(1)
inventoryCache.Set(cacheKey, b)
},
}
Expand Down
7 changes: 5 additions & 2 deletions cmd/frontend/backend/repos.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ package backend
import (
"context"
"fmt"
"github.com/sourcegraph/sourcegraph/internal/env"
"net/http"
"strconv"
"time"

"github.com/prometheus/client_golang/prometheus"
Expand Down Expand Up @@ -249,6 +251,8 @@ func (s *repos) ListIndexable(ctx context.Context) (repos []types.MinimalRepo, e
})
}

var getInventoryTimeout, _ = strconv.Atoi(env.Get("GET_INVENTORY_TIMEOUT", "5", "Time in minutes before cancelling getInventory requests. Raise this if your repositories are large and need a long time to process."))

func (s *repos) GetInventory(ctx context.Context, repo api.RepoName, commitID api.CommitID, forceEnhancedLanguageDetection bool) (res *inventory.Inventory, err error) {
if Mocks.Repos.GetInventory != nil {
return Mocks.Repos.GetInventory(ctx, repo, commitID)
Expand All @@ -257,8 +261,7 @@ func (s *repos) GetInventory(ctx context.Context, repo api.RepoName, commitID ap
ctx, done := startTrace(ctx, "GetInventory", map[string]any{"repo": repo, "commitID": commitID}, &err)
defer done()

// Cap GetInventory operation to some reasonable time.
ctx, cancel := context.WithTimeout(ctx, 3*time.Minute)
ctx, cancel := context.WithTimeout(ctx, time.Duration(getInventoryTimeout)*time.Minute)
defer cancel()

invCtx, err := InventoryContext(s.logger, repo, s.gitserverClient, commitID, forceEnhancedLanguageDetection)
Expand Down
5 changes: 5 additions & 0 deletions cmd/frontend/internal/inventory/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,14 @@ go_library(
importpath = "github.com/sourcegraph/sourcegraph/cmd/frontend/internal/inventory",
visibility = ["//cmd/frontend:__subpackages__"],
deps = [
"//internal/trace",
"//lib/errors",
"@com_github_go_enry_go_enry_v2//:go-enry",
"@com_github_go_enry_go_enry_v2//data",
"@com_github_grafana_regexp//:regexp",
"@com_github_sourcegraph_conc//iter",
"@com_github_sourcegraph_log//:log",
"@io_opentelemetry_go_otel//attribute",
],
)

Expand All @@ -25,6 +29,7 @@ go_test(
"inventory_test.go",
],
embed = [":inventory"],
race = "off",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated entries.go to use concurrency via conc's iter.MapErr. This means that we're racing various tasks. Our bazel tests by default don't like racing. Unless we want to do a major refactor of the code, we need to allow racing here. From what I can see in the code this should be safe.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, that's a problem. If the race detector triggers, that almost always indicates a bug. It has a very low false positive rate. We should fix the race conditions.

deps = [
"//internal/fileutil",
"//lib/errors",
Expand Down
4 changes: 2 additions & 2 deletions cmd/frontend/internal/inventory/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ type Context struct {
NewFileReader func(ctx context.Context, path string) (io.ReadCloser, error)

// CacheGet, if set, returns the cached inventory and true for the given tree, or false for a cache miss.
CacheGet func(fs.FileInfo) (Inventory, bool)
CacheGet func(context.Context, fs.FileInfo) (Inventory, bool)

// CacheSet, if set, stores the inventory in the cache for the given tree.
CacheSet func(fs.FileInfo, Inventory)
CacheSet func(context.Context, fs.FileInfo, Inventory)
}
37 changes: 18 additions & 19 deletions cmd/frontend/internal/inventory/entries.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@ package inventory

import (
"context"
"github.com/sourcegraph/conc/iter"
"github.com/sourcegraph/sourcegraph/lib/errors"
"io/fs"
"sort"

"github.com/sourcegraph/sourcegraph/lib/errors"
)

// fileReadBufferSize is the size of the buffer we'll use while reading file contents
Expand Down Expand Up @@ -48,14 +48,14 @@ func (c *Context) entries(ctx context.Context, entries []fs.FileInfo, buf []byte
func (c *Context) tree(ctx context.Context, tree fs.FileInfo, buf []byte) (inv Inventory, err error) {
// Get and set from the cache.
if c.CacheGet != nil {
if inv, ok := c.CacheGet(tree); ok {
if inv, ok := c.CacheGet(ctx, tree); ok {
return inv, nil // cache hit
}
}
if c.CacheSet != nil {
defer func() {
if err == nil {
c.CacheSet(tree, inv) // store in cache
c.CacheSet(ctx, tree, inv) // store in cache
}
}()
}
Expand All @@ -64,45 +64,44 @@ func (c *Context) tree(ctx context.Context, tree fs.FileInfo, buf []byte) (inv I
if err != nil {
return Inventory{}, err
}
invs := make([]Inventory, len(entries))
for i, e := range entries {

inventories, err := iter.MapErr(entries, func(entry *fs.FileInfo) (Inventory, error) {
e := *entry
switch {
case e.Mode().IsRegular(): // file
// Don't individually cache files that we found during tree traversal. The hit rate for
// those cache entries is likely to be much lower than cache entries for files whose
// inventory was directly requested.
lang, err := getLang(ctx, e, buf, c.NewFileReader)
if err != nil {
return Inventory{}, errors.Wrapf(err, "inventory file %q", e.Name())
}
invs[i] = Inventory{Languages: []Lang{lang}}

return Inventory{Languages: []Lang{lang}}, err
case e.Mode().IsDir(): // subtree
subtreeInv, err := c.tree(ctx, e, buf)
if err != nil {
return Inventory{}, errors.Wrapf(err, "inventory tree %q", e.Name())
}
invs[i] = subtreeInv

return subtreeInv, err
default:
// Skip symlinks, submodules, etc.
return Inventory{}, nil
}
})

if err != nil {
return Inventory{}, err
}
return Sum(invs), nil

return Sum(inventories), nil
}

// file computes the inventory of a single file. It caches the result.
func (c *Context) file(ctx context.Context, file fs.FileInfo, buf []byte) (inv Inventory, err error) {
// Get and set from the cache.
if c.CacheGet != nil {
if inv, ok := c.CacheGet(file); ok {
if inv, ok := c.CacheGet(ctx, file); ok {
return inv, nil // cache hit
}
}
if c.CacheSet != nil {
defer func() {
if err == nil {
c.CacheSet(file, inv) // store in cache
c.CacheSet(ctx, file, inv) // store in cache
}
}()
}
Expand Down
23 changes: 14 additions & 9 deletions cmd/frontend/internal/inventory/entries_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"io/fs"
"os"
"reflect"
"sync"
"testing"

"github.com/google/go-cmp/cmp"
Expand All @@ -21,8 +22,11 @@ func TestContext_Entries(t *testing.T) {
cacheGetCalls []string
cacheSetCalls = map[string]Inventory{}
)
var mu sync.Mutex
c := Context{
ReadTree: func(ctx context.Context, path string) ([]fs.FileInfo, error) {
mu.Lock()
defer mu.Unlock()
readTreeCalls = append(readTreeCalls, path)
switch path {
case "d":
Expand All @@ -37,6 +41,8 @@ func TestContext_Entries(t *testing.T) {
}
},
NewFileReader: func(ctx context.Context, path string) (io.ReadCloser, error) {
mu.Lock()
defer mu.Unlock()
newFileReaderCalls = append(newFileReaderCalls, path)
var data []byte
switch path {
Expand All @@ -51,11 +57,15 @@ func TestContext_Entries(t *testing.T) {
}
return io.NopCloser(bytes.NewReader(data)), nil
},
CacheGet: func(e fs.FileInfo) (Inventory, bool) {
CacheGet: func(ctx context.Context, e fs.FileInfo) (Inventory, bool) {
mu.Lock()
defer mu.Unlock()
cacheGetCalls = append(cacheGetCalls, e.Name())
return Inventory{}, false
},
CacheSet: func(e fs.FileInfo, inv Inventory) {
CacheSet: func(ctx context.Context, e fs.FileInfo, inv Inventory) {
mu.Lock()
defer mu.Unlock()
if _, ok := cacheSetCalls[e.Name()]; ok {
t.Fatalf("already stored %q in cache", e.Name())
}
Expand Down Expand Up @@ -83,13 +93,8 @@ func TestContext_Entries(t *testing.T) {
if want := []string{"d", "d/a"}; !reflect.DeepEqual(readTreeCalls, want) {
t.Errorf("ReadTree calls: got %q, want %q", readTreeCalls, want)
}
if want := []string{
// We need to read all files to get line counts
"d/a/c.m",
"d/b.go",
"f.go",
}; !reflect.DeepEqual(newFileReaderCalls, want) {
t.Errorf("GetFileReader calls: got %q, want %q", newFileReaderCalls, want)
if len(newFileReaderCalls) != 3 {
t.Errorf("GetFileReader calls: got %d, want %d", len(newFileReaderCalls), 3)
}
if want := []string{"d", "d/a", "f.go"}; !reflect.DeepEqual(cacheGetCalls, want) {
t.Errorf("CacheGet calls: got %q, want %q", cacheGetCalls, want)
Expand Down
Loading