Skip to content

Commit 50e7454

Browse files
authored
Pujol cg 10460 make codebasefetch codebase work on the sdk (#50)
# Motivation

Allows us to fetch remote codebases.

- [x] I have added tests for my changes
- [x] I have updated the documentation or added new documentation as needed
- [x] I have read and agree to the [Contributor License Agreement](../CLA.md)

---------

Co-authored-by: KopekC <[email protected]>
1 parent 0ee52a0 commit 50e7454

File tree

4 files changed

+102
-79
lines changed

4 files changed

+102
-79
lines changed

docs/building-with-codegen/parsing-codebases.mdx

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,20 +34,20 @@ codebase = Codebase("./")
3434

3535
## Remote Repositories
3636

37-
To fetch and parse a repository directly from GitHub, use the `fetch_codebase` function.
37+
To fetch and parse a repository directly from GitHub, use the `from_repo` function.
3838

3939
```python
4040
import codegen
4141

4242
# Fetch and parse a repository (defaults to /tmp/codegen/{repo_name})
43-
codebase = codegen.fetch_codebase('fastapi/fastapi')
43+
codebase = codegen.from_repo('fastapi/fastapi')
4444

4545
# Customize temp directory, clone depth, or specific commit
46-
codebase = codegen.fetch_codebase(
46+
codebase = codegen.from_repo(
4747
'fastapi/fastapi',
4848
tmp_dir='/custom/temp/dir', # Optional: custom temp directory
49+
commit='786a8ada7ed0c7f9d8b04d49f24596865e4b7901', # Optional: specific commit
4950
shallow=False, # Optional: full clone instead of shallow
50-
commit_hash='fe513719ea98abade167d8a89e92f600d9d8f0e5' # Optional: specific commit
5151
)
5252
```
5353

src/codegen/git/repo_operator/local_repo_operator.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,49 @@ def create_from_commit(cls, repo_path: str, default_branch: str, commit: str, ur
8282
op.checkout_commit(commit)
8383
return op
8484

85+
@classmethod
def create_from_repo(cls, repo_path: str, url: str) -> Self:
    """Return an operator for ``url`` at ``repo_path``, reusing an up-to-date clone.

    If ``repo_path`` already holds a git repository that has a remote whose URL
    equals ``url`` and whose checked-out commit matches that remote's ref for the
    active branch (after a fetch), the existing checkout is reused. In every other
    case anything at ``repo_path`` is deleted and replaced by a fresh ``depth=1``
    clone.

    Args:
        repo_path (str): Path where the repo should be cloned
        url (str): Git URL of the repository

    Returns:
        An instance of this class whose default branch is the active branch of
        the reused or freshly cloned checkout.
    """
    import logging
    import shutil

    log = logging.getLogger(__name__)

    if os.path.exists(repo_path):
        try:
            git_cli = GitCLI(repo_path)
            # Compare against the remote that actually matches ``url``; it is
            # not necessarily the one named "origin".
            remote = next((r for r in git_cli.remotes if r.url == url), None)
            if remote is not None:
                remote.fetch()
                branch_name = git_cli.active_branch.name
                local_sha = git_cli.head.commit.hexsha
                remote_sha = remote.refs[branch_name].commit.hexsha
                if local_sha == remote_sha:
                    return cls(repo_config=BaseRepoConfig(), repo_path=repo_path, default_branch=branch_name, bot_commit=False)
        except Exception as e:
            # Best-effort reuse: any git failure (not a repo, detached HEAD,
            # missing ref, network error, ...) falls back to a fresh clone,
            # but the reason is recorded instead of silently discarded.
            log.warning(f"Could not reuse existing checkout at {repo_path}: {e}")

        # The path exists but is stale or unusable. ``shutil.rmtree`` only
        # handles real directories, so files and symlinks are unlinked instead.
        if os.path.isdir(repo_path) and not os.path.islink(repo_path):
            shutil.rmtree(repo_path)
        else:
            os.remove(repo_path)

    # Shallow clone: only the latest commit is needed here.
    GitCLI.clone_from(url=url, to_path=repo_path, depth=1)

    # Re-open the fresh clone to discover which branch was checked out.
    git_cli = GitCLI(repo_path)
    default_branch = git_cli.active_branch.name

    return cls(repo_config=BaseRepoConfig(), repo_path=repo_path, default_branch=default_branch, bot_commit=False)
127+
85128
####################################################################################################################
86129
# PROPERTIES
87130
####################################################################################################################

src/codegen/sdk/core/codebase.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,6 +1085,61 @@ def set_session_options(self, **kwargs: Unpack[SessionOptions]) -> None:
10851085
self.G.transaction_manager.set_max_transactions(self.G.session_options.max_transactions)
10861086
self.G.transaction_manager.reset_stopwatch(self.G.session_options.max_seconds)
10871087

1088+
@classmethod
def from_repo(cls, repo_name: str, *, tmp_dir: str | None = None, commit: str | None = None, shallow: bool = True) -> "Codebase":
    """Fetches a codebase from GitHub and returns a Codebase instance.

    The repository is cloned to ``<tmp_dir>/<repo>``, where ``<repo>`` is the part
    of ``repo_name`` after the slash.

    Args:
        repo_name (str): The name of the repository in format "owner/repo"
        tmp_dir (Optional[str]): The directory to clone the repo into. Defaults to /tmp/codegen
        commit (Optional[str]): The specific commit hash to check out. When omitted,
            the latest commit of the remote's default branch is used.
        shallow (bool): Whether to do a shallow clone. Defaults to True.
            NOTE: this flag is not forwarded to the repo operator. Without
            ``commit`` the clone is always shallow, and a warning is logged
            if ``shallow=False`` was requested.

    Returns:
        Codebase: A Codebase instance initialized with the cloned repository

    Raises:
        ValueError: If ``repo_name`` is not exactly "owner/repo" with two
            non-empty components, or the repo component is "." or "..".
    """
    logger.info(f"Fetching codebase for {repo_name}")

    # Parse repo name. An empty, "." or ".." repo component would make
    # repo_path resolve to tmp_dir (or its parent), which the clone step may
    # delete, so such names are rejected up front.
    parts = repo_name.split("/")
    if len(parts) != 2 or not all(parts) or parts[1] in (".", ".."):
        raise ValueError("repo_name must be in format 'owner/repo'")
    repo = parts[1]

    # Setup temp directory
    if tmp_dir is None:
        tmp_dir = "/tmp/codegen"
    os.makedirs(tmp_dir, exist_ok=True)
    logger.info(f"Using directory: {tmp_dir}")

    # Setup repo path and URL
    repo_path = os.path.join(tmp_dir, repo)
    repo_url = f"https://github.com/{repo_name}.git"
    logger.info(f"Will clone {repo_url} to {repo_path}")

    if commit is None and not shallow:
        logger.warning("shallow=False is not supported without a commit; performing a shallow clone")

    try:
        # Use LocalRepoOperator to fetch the repository
        logger.info("Cloning repository...")
        if commit is None:
            repo_operator = LocalRepoOperator.create_from_repo(repo_path=repo_path, url=repo_url)
        else:
            repo_operator = LocalRepoOperator.create_from_commit(
                repo_path=repo_path,
                # NOTE(review): "main" is a placeholder; nothing here looks up
                # the remote's real default branch - confirm this is acceptable.
                default_branch="main",
                commit=commit,
                url=repo_url,
            )
        logger.info("Clone completed successfully")

        # Initialize and return codebase with proper context
        logger.info("Initializing Codebase...")
        project = ProjectConfig(repo_operator=repo_operator, programming_language=determine_project_language(repo_path))
        # Instantiate via ``cls`` so subclasses get an instance of their own type.
        codebase = cls(projects=[project], config=DefaultConfig)
        logger.info("Codebase initialization complete")
        return codebase
    except Exception as e:
        # Log with traceback, then let the caller see the original error.
        logger.exception(f"Failed to initialize codebase: {e}")
        raise
1142+
10881143

10891144
# The last 2 lines of code are added to the runner. See codegen-backend/cli/generate/utils.py
10901145
# Type Aliases

src/codegen/sdk/fetch_codebase.py

Lines changed: 0 additions & 75 deletions
This file was deleted.

0 commit comments

Comments
 (0)