-
Notifications
You must be signed in to change notification settings - Fork 299
Add multi-model smoke tests #457
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,179 @@ | ||
# Workflow name as shown in the Actions UI.
name: test

on:
  # Pull requests against main that are opened, updated, or reopened.
  pull_request_target:
    types:
      - opened
      - synchronize
      - reopened
    branches: [main]
  # Pushes to main, ignoring changes under docs/.
  push:
    branches: [main]
    paths-ignore:
      - docs/**
  # Manual runs.
  workflow_dispatch:
|
||
jobs: | ||
# Gate job: decides whether the smoke-test jobs may run for this event and
# publishes the decision as the run_smoke_tests output ("true" or "false").
check-label:
  runs-on: ubuntu-22.04
  outputs:
    run_smoke_tests: ${{ steps.check.outputs.run_smoke_tests }}
  steps:
    - name: Check if PR author is a member of the organization or has the run-smoke label
      id: check
      # Event data and the token are handed to the script through the
      # environment rather than expanded inside it, so their values are never
      # parsed as shell code.
      env:
        EVENT_NAME: ${{ github.event_name }}
        ORG: gptscript-ai
        AUTHOR: ${{ github.event.pull_request.user.login }}
        REPO: ${{ github.repository_owner }}/${{ github.event.repository.name }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        case "$EVENT_NAME" in
          push|workflow_dispatch)
            # Run smoke tests for pushes to the base repo and for manual runs
            # against the base branch.
            echo "run_smoke_tests=true" >> "$GITHUB_OUTPUT"
            exit 0
            ;;
          pull_request_target)
            # Check for org membership; a 204 response is treated as "is a member".
            MEMBERSHIP_RESPONSE_CODE=$(curl -s -o /dev/null -w "%{http_code}" \
              -H "Authorization: token $API_TOKEN" \
              "https://api.github.com/orgs/$ORG/members/$AUTHOR")

            if [ "$MEMBERSHIP_RESPONSE_CODE" -eq 204 ]; then
              echo "run_smoke_tests=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi

            # Check for the "run-smoke" label. The whole label name must match,
            # so a label that merely contains "run-smoke" does not qualify.
            LABELS=$(curl -s -H "Authorization: token $API_TOKEN" \
              "https://api.github.com/repos/$REPO/issues/$PR_NUMBER/labels" | jq -r '.[].name')
            if echo "$LABELS" | grep -qxF "run-smoke"; then
              # Run smoke tests for PR with the "run-smoke" label
              echo "run_smoke_tests=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi
            ;;
        esac

        # Any other event, or a PR that passed neither check, does not run smoke tests.
        echo "run_smoke_tests=false" >> "$GITHUB_OUTPUT"
|
||
# Smoke test with gpt-4o-2024-05-13 as the default model; runs only when the
# check-label job reported run_smoke_tests == 'true'.
smoke-gpt-4o-2024-05-13:
  needs: check-label
  if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
  runs-on: ubuntu-22.04
  steps:
    - name: Checkout base repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
    - name: Checkout PR code if running for a PR
      if: ${{ github.event_name == 'pull_request_target' }}
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        repository: ${{ github.event.pull_request.head.repo.full_name }}
        # Pin to the commit recorded in the triggering event. A branch name can
        # be moved to different code after the gate check has passed.
        ref: ${{ github.event.pull_request.head.sha }}
    - uses: actions/setup-go@v5
      with:
        cache: false
        go-version: "1.21"
    - name: Run smoke test for gpt-4o-2024-05-13
      env:
        OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
        GPTSCRIPT_DEFAULT_MODEL: gpt-4o-2024-05-13
      run: |
        echo "Running smoke test for model gpt-4o-2024-05-13"
        export PATH="$(pwd)/bin:${PATH}"
        make smoke
|
||
# Smoke test with gpt-4-turbo-2024-04-09 as the default model; runs only when
# the check-label job reported run_smoke_tests == 'true'.
smoke-gpt-4-turbo-2024-04-09:
  needs: check-label
  if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
  runs-on: ubuntu-22.04
  steps:
    - name: Checkout base repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
    - name: Checkout PR code if running for a PR
      if: ${{ github.event_name == 'pull_request_target' }}
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        repository: ${{ github.event.pull_request.head.repo.full_name }}
        # Pin to the commit recorded in the triggering event. A branch name can
        # be moved to different code after the gate check has passed.
        ref: ${{ github.event.pull_request.head.sha }}
    - uses: actions/setup-go@v5
      with:
        cache: false
        go-version: "1.21"
    - name: Run smoke test for gpt-4-turbo-2024-04-09
      env:
        OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
        GPTSCRIPT_DEFAULT_MODEL: gpt-4-turbo-2024-04-09
      run: |
        echo "Running smoke test for model gpt-4-turbo-2024-04-09"
        export PATH="$(pwd)/bin:${PATH}"
        make smoke
|
||
# Smoke test with claude-3-opus-20240229 (via the claude3-anthropic-provider
# tool) as the default model; runs only when the check-label job reported
# run_smoke_tests == 'true'.
smoke-claude-3-opus-20240229:
  needs: check-label
  if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
  runs-on: ubuntu-22.04
  steps:
    - name: Checkout base repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
    - name: Checkout PR code if running for a PR
      if: ${{ github.event_name == 'pull_request_target' }}
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        repository: ${{ github.event.pull_request.head.repo.full_name }}
        # Pin to the commit recorded in the triggering event. A branch name can
        # be moved to different code after the gate check has passed.
        ref: ${{ github.event.pull_request.head.sha }}
    - uses: actions/setup-go@v5
      with:
        cache: false
        go-version: "1.21"
    - name: Run smoke test for claude-3-opus-20240229
      env:
        OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
        GPTSCRIPT_DEFAULT_MODEL: claude-3-opus-20240229 from github.com/gptscript-ai/claude3-anthropic-provider@tool-beta
        ANTHROPIC_API_KEY: ${{ secrets.SMOKE_ANTHROPIC_API_KEY }}
      run: |
        echo "Running smoke test for model claude-3-opus-20240229"
        export PATH="$(pwd)/bin:${PATH}"
        make smoke
|
||
# Smoke test with mistral-large-2402 (served from https://api.mistral.ai/v1) as
# the default model; runs only when the check-label job reported
# run_smoke_tests == 'true'.
smoke-mistral-large-2402:
  needs: check-label
  if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
  runs-on: ubuntu-22.04
  steps:
    - name: Checkout base repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
    - name: Checkout PR code if running for a PR
      if: ${{ github.event_name == 'pull_request_target' }}
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        repository: ${{ github.event.pull_request.head.repo.full_name }}
        # Pin to the commit recorded in the triggering event. A branch name can
        # be moved to different code after the gate check has passed.
        ref: ${{ github.event.pull_request.head.sha }}
    - uses: actions/setup-go@v5
      with:
        cache: false
        go-version: "1.21"
    - name: Run smoke test for mistral-large-2402
      env:
        OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
        GPTSCRIPT_DEFAULT_MODEL: mistral-large-2402 from https://api.mistral.ai/v1
        GPTSCRIPT_PROVIDER_API_MISTRAL_AI_API_KEY: ${{ secrets.SMOKE_GPTSCRIPT_PROVIDER_API_MISTRAL_AI_API_KEY }}
      run: |
        echo "Running smoke test for model mistral-large-2402"
        export PATH="$(pwd)/bin:${PATH}"
        make smoke
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
package judge | ||
|
||
import ( | ||
"context" | ||
"encoding/json" | ||
"fmt" | ||
|
||
"github.com/getkin/kin-openapi/openapi3gen" | ||
openai "github.com/gptscript-ai/chat-completion-client" | ||
) | ||
|
||
// instructions is the system prompt template given to the judge model.
//
// Its single %s verb is filled with the JSONSchema of the comparison payload.
// The prompt then tells the model how to decide equality and to answer with a
// "ruling" JSON object holding "equal" and "reasoning".
const instructions = `When given JSON objects that conform to the following JSONSchema:

%s

Determine if "actual" is equal to "expected" based on the comparison constraints described by "criteria".
"actual" is considered equal to "expected" if and only if all of the constraints described by "criteria" are satisfied.

After making a determination, respond with a JSON object that conforms to the following JSONSchema:

{
"name": "ruling",
"type": "object",
"properties": {
"equal": {
"type": "boolean",
"description": "Set to true if and only if actual is considered equal to expected."
},
"reasoning": {
"type": "string",
"description": "The reasoning used to come to the determination, that points out all instances where the given criteria was violated"
}
},
"required": [
"equal",
"reasoning"
]
}

Your responses are concise and include only the json object described above.
`
|
||
type Judge[T any] struct { | ||
client *openai.Client | ||
instructions string | ||
} | ||
|
||
// comparison is the JSON payload sent to the judge as the user message.
type comparison[T any] struct {
	// Expected is the value the judge compares against.
	Expected T `json:"expected"`
	// Actual is the value being judged.
	Actual T `json:"actual"`
	// Criteria describes the constraints under which the two values count as equal.
	Criteria string `json:"criteria"`
}
|
||
// ruling is the JSON object the judge is instructed to respond with.
type ruling struct {
	// Equal reports whether the judge considered actual equal to expected.
	Equal bool `json:"equal"`
	// Reasoning is the judge's explanation of its determination.
	Reasoning string `json:"reasoning"`
}
|
||
func New[T any](client *openai.Client) (*Judge[T], error) { | ||
schema, err := openapi3gen.NewSchemaRefForValue( | ||
new(comparison[T]), | ||
nil, | ||
openapi3gen.CreateComponentSchemas( | ||
openapi3gen.ExportComponentSchemasOptions{ | ||
ExportComponentSchemas: true, | ||
ExportGenerics: false, | ||
}), | ||
) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to generate JSONSchema for %T: %w", new(T), err) | ||
} | ||
|
||
schemaJSON, err := json.MarshalIndent(schema, "", " ") | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to marshal JSONSchema for %T: %w", new(T), err) | ||
} | ||
|
||
return &Judge[T]{ | ||
client: client, | ||
instructions: fmt.Sprintf(instructions, schemaJSON), | ||
}, nil | ||
} | ||
|
||
func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria string) (equal bool, reasoning string, err error) { | ||
comparisonJSON, err := json.MarshalIndent(&comparison[T]{ | ||
Expected: expected, | ||
Actual: actual, | ||
Criteria: criteria, | ||
}, "", " ") | ||
if err != nil { | ||
return false, "", fmt.Errorf("failed to marshal judge testcase JSON: %w", err) | ||
} | ||
|
||
request := openai.ChatCompletionRequest{ | ||
Model: openai.GPT4o, | ||
Temperature: new(float32), | ||
N: 1, | ||
ResponseFormat: &openai.ChatCompletionResponseFormat{ | ||
Type: openai.ChatCompletionResponseFormatTypeJSONObject, | ||
}, | ||
Messages: []openai.ChatCompletionMessage{ | ||
{ | ||
Role: openai.ChatMessageRoleSystem, | ||
Content: j.instructions, | ||
}, | ||
{ | ||
Role: openai.ChatMessageRoleUser, | ||
Content: string(comparisonJSON), | ||
}, | ||
}, | ||
} | ||
response, err := j.client.CreateChatCompletion(ctx, request) | ||
if err != nil { | ||
return false, "", fmt.Errorf("failed to make judge chat completion request: %w", err) | ||
} | ||
|
||
if len(response.Choices) < 1 { | ||
return false, "", fmt.Errorf("judge chat completion request returned no choices") | ||
} | ||
|
||
var equality ruling | ||
if err := json.Unmarshal([]byte(response.Choices[0].Message.Content), &equality); err != nil { | ||
return false, "", fmt.Errorf("failed to unmarshal judge ruling: %w", err) | ||
} | ||
|
||
return equality.Equal, equality.Reasoning, nil | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.