Skip to content

Pujol cg 10460 make codebasefetch codebase work on the sdk #50

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions docs/building-with-codegen/parsing-codebases.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,20 @@ codebase = Codebase("./")

## Remote Repositories

To fetch and parse a repository directly from GitHub, use the `fetch_codebase` function.
To fetch and parse a repository directly from GitHub, use the `from_repo` function.

```python
import codegen

# Fetch and parse a repository (defaults to /tmp/codegen/{repo_name})
codebase = codegen.fetch_codebase('fastapi/fastapi')
codebase = codegen.from_repo('fastapi/fastapi')

# Customize temp directory, clone depth, or specific commit
codebase = codegen.fetch_codebase(
codebase = codegen.from_repo(
'fastapi/fastapi',
tmp_dir='/custom/temp/dir', # Optional: custom temp directory
commit='786a8ada7ed0c7f9d8b04d49f24596865e4b7901',
shallow=False, # Optional: full clone instead of shallow
commit_hash='fe513719ea98abade167d8a89e92f600d9d8f0e5' # Optional: specific commit
)
```

Expand Down
43 changes: 43 additions & 0 deletions src/codegen/git/repo_operator/local_repo_operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,49 @@ def create_from_commit(cls, repo_path: str, default_branch: str, commit: str, ur
op.checkout_commit(commit)
return op

@classmethod
def create_from_repo(cls, repo_path: str, url: str) -> Self:
    """Create an operator for ``url`` at ``repo_path``, reusing an up-to-date checkout.

    If ``repo_path`` already holds a git checkout that has a remote pointing at
    ``url`` and whose HEAD equals that remote's ref for the active branch (after
    a fetch), the existing checkout is reused. Otherwise whatever is at
    ``repo_path`` is removed and a fresh ``depth=1`` clone is made.

    Args:
        repo_path (str): Path where the repo should be cloned
        url (str): Git URL of the repository

    Returns:
        Self: An operator bound to ``repo_path`` whose default branch is the
        checkout's active branch.
    """
    # NOTE(review): a reviewer on this change asked "Why does this not take in a
    # commit?" -- this constructor always resolves to the latest remote HEAD;
    # pinning a commit goes through `create_from_commit` instead.
    import logging
    import shutil

    if os.path.exists(repo_path):
        try:
            git_cli = GitCLI(repo_path)
            # Compare against the remote that actually matches `url`; it is not
            # necessarily named "origin".
            matching_remote = next((remote for remote in git_cli.remotes if remote.url == url), None)
            if matching_remote is not None:
                # Fetch so the remote-tracking ref reflects the current upstream state.
                matching_remote.fetch()
                branch_name = git_cli.active_branch.name
                local_head = git_cli.head.commit
                remote_head = matching_remote.refs[branch_name].commit
                if local_head.hexsha == remote_head.hexsha:
                    return cls(repo_config=BaseRepoConfig(), repo_path=repo_path, default_branch=branch_name, bot_commit=False)
        except Exception as exc:
            # Best effort: any git failure (not a repo, detached HEAD, missing ref,
            # fetch error) falls back to a fresh clone, but leave a trace of why.
            logging.getLogger(__name__).warning("Could not reuse checkout at %s (%s); re-cloning", repo_path, exc)

        # The path exists but is stale or unusable: clear it so the clone can succeed.
        if os.path.isdir(repo_path) and not os.path.islink(repo_path):
            shutil.rmtree(repo_path)
        else:
            # rmtree refuses plain files and symlinks; remove those directly.
            os.remove(repo_path)

    # Shallow clone: only the latest commit is needed.
    GitCLI.clone_from(url=url, to_path=repo_path, depth=1)

    # Re-open the fresh clone to discover which branch was checked out.
    git_cli = GitCLI(repo_path)
    default_branch = git_cli.active_branch.name

    return cls(repo_config=BaseRepoConfig(), repo_path=repo_path, default_branch=default_branch, bot_commit=False)

####################################################################################################################
# PROPERTIES
####################################################################################################################
Expand Down
55 changes: 55 additions & 0 deletions src/codegen/sdk/core/codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -1085,6 +1085,61 @@ def set_session_options(self, **kwargs: Unpack[SessionOptions]) -> None:
self.G.transaction_manager.set_max_transactions(self.G.session_options.max_transactions)
self.G.transaction_manager.reset_stopwatch(self.G.session_options.max_seconds)

@classmethod
def from_repo(cls, repo_name: str, *, tmp_dir: str | None = None, commit: str | None = None, shallow: bool = True) -> "Codebase":
    """Fetches a codebase from GitHub and returns a Codebase instance.

    Args:
        repo_name (str): The name of the repository in format "owner/repo"
        tmp_dir (Optional[str]): The directory to clone the repo into. Defaults to /tmp/codegen
        commit (Optional[str]): The specific commit hash to check out. When omitted, the
            repository is obtained via ``LocalRepoOperator.create_from_repo``.
        shallow (bool): Whether to do a shallow clone. Defaults to True.
            NOTE: this flag is accepted but not currently forwarded to the repo
            operator by this method.

    Returns:
        Codebase: A Codebase instance initialized with the cloned repository

    Raises:
        ValueError: If ``repo_name`` is not exactly ``owner/repo`` with non-empty
            parts, or the repo part is ``.`` or ``..``.
    """
    logger.info(f"Fetching codebase for {repo_name}")

    # Parse repo name. The repo component becomes a directory under tmp_dir that
    # may later be deleted and re-cloned, so reject anything that would resolve
    # to tmp_dir itself or escape it (empty, ".", "..", or extra path segments).
    owner, separator, repo = repo_name.partition("/")
    if not separator or not owner or not repo or "/" in repo or repo in {".", ".."}:
        raise ValueError("repo_name must be in format 'owner/repo'")

    # Setup temp directory
    if tmp_dir is None:
        tmp_dir = "/tmp/codegen"
    os.makedirs(tmp_dir, exist_ok=True)
    logger.info(f"Using directory: {tmp_dir}")

    # Setup repo path and URL
    repo_path = os.path.join(tmp_dir, repo)
    repo_url = f"https://github.com/{repo_name}.git"
    logger.info(f"Will clone {repo_url} to {repo_path}")

    try:
        # Use LocalRepoOperator to fetch the repository
        logger.info("Cloning repository...")
        if commit is None:
            repo_operator = LocalRepoOperator.create_from_repo(repo_path=repo_path, url=repo_url)
        else:
            repo_operator = LocalRepoOperator.create_from_commit(
                repo_path=repo_path,
                default_branch="main",  # placeholder; original note says the actual default branch is resolved after clone
                commit=commit,
                url=repo_url,
            )
        logger.info("Clone completed successfully")

        # Initialize and return codebase with proper context
        logger.info("Initializing Codebase...")
        project = ProjectConfig(repo_operator=repo_operator, programming_language=determine_project_language(repo_path))
        codebase = Codebase(projects=[project], config=DefaultConfig)
        logger.info("Codebase initialization complete")
        return codebase
    except Exception as e:
        # Log with traceback, then propagate unchanged so callers still see the original error.
        logger.exception(f"Failed to initialize codebase: {e}")
        raise


# The last 2 lines of code are added to the runner. See codegen-backend/cli/generate/utils.py
# Type Aliases
Expand Down
75 changes: 0 additions & 75 deletions src/codegen/sdk/fetch_codebase.py

This file was deleted.

Loading