Commit 0d47e77

feat: add model to translate script

Parent: 4490fd7

7 files changed: +102 additions, −31 deletions

packages/translate/src/chunk.ts
Lines changed: 27 additions & 5 deletions

```diff
@@ -1,8 +1,22 @@
+import type { DeepSeekModel } from './types';
+
 // Constants for token estimation
 export const CHAR_TO_TOKEN_RATIO = 0.3; // 1 English character ≈ 0.3 token
 export const CHAR_TO_TOKEN_RATIO_ZH = 0.5; // 1 Chinese character ≈ 0.5 token
 export const MAX_INPUT_TOKENS = 64 * 1024; // DeepSeek's 64K context length
-export const MAX_OUTPUT_TOKENS = 8 * 1024; // DeepSeek's 8K max output
+
+// Model-specific output token limits
+export const MAX_OUTPUT_TOKENS_CHAT = 8 * 1024; // deepseek-chat: max 8K output
+export const MAX_OUTPUT_TOKENS_REASONER = 64 * 1024; // deepseek-reasoner: max 64K output
+
+// Get max output tokens for a specific model
+export function getMaxOutputTokens(
+  model: DeepSeekModel = 'deepseek-chat',
+): number {
+  return model === 'deepseek-reasoner'
+    ? MAX_OUTPUT_TOKENS_REASONER
+    : MAX_OUTPUT_TOKENS_CHAT;
+}
 
 // Chunk size constants (in estimated tokens)
 export const MAX_CHUNK_SIZE_TOKENS = 16 * 1024; // Use smaller chunks for better translation quality
@@ -13,12 +27,20 @@ export function estimateTokens(content: string): number {
   return Math.ceil(content.length * CHAR_TO_TOKEN_RATIO_ZH);
 }
 
-export function needsChunking(content: string): boolean {
-  return estimateTokens(content) > MAX_OUTPUT_TOKENS;
+export function needsChunking(
+  content: string,
+  model: DeepSeekModel = 'deepseek-chat',
+): boolean {
+  return estimateTokens(content) > getMaxOutputTokens(model);
 }
 
 // Split text into chunks that respect markdown structure and heading hierarchy
-export function splitIntoChunks(content: string): string[] {
+export function splitIntoChunks(
+  content: string,
+  model: DeepSeekModel = 'deepseek-chat',
+): string[] {
+  const maxOutputTokens = getMaxOutputTokens(model);
+
   // Define a regex pattern for markdown headings (## Heading)
   const headingPattern = /^(#{2,}) /gm;
 
@@ -65,7 +87,7 @@ export function splitIntoChunks(content: string): string[] {
   for (const section of sections) {
     const sectionTokens = estimateTokens(section);
 
-    if (currentTokens + sectionTokens > MAX_OUTPUT_TOKENS) {
+    if (currentTokens + sectionTokens > maxOutputTokens) {
       // If adding this section would exceed the limit, start a new chunk
      chunks.push(currentChunk);
      currentChunk = section;
```
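
With CHAR_TO_TOKEN_RATIO_ZH at 0.5, these caps work out to roughly 16K estimated characters before chunking kicks in on deepseek-chat (8 × 1024 ÷ 0.5) versus roughly 128K characters on deepseek-reasoner (64 × 1024 ÷ 0.5). A minimal sketch of the new call shapes, using only the exported helpers above (the sample document is hypothetical):

```ts
import { getMaxOutputTokens, needsChunking, splitIntoChunks } from './chunk';

// Per-model output caps: 8 * 1024 for deepseek-chat, 64 * 1024 for deepseek-reasoner.
getMaxOutputTokens();                    // 8192 (defaults to 'deepseek-chat')
getMaxOutputTokens('deepseek-reasoner'); // 65536

// A hypothetical 20,000-character document estimates to 10,000 tokens
// (20,000 * CHAR_TO_TOKEN_RATIO_ZH), which exceeds the chat cap but not the
// reasoner cap, so the chunking decision now depends on the chosen model.
const doc = '文'.repeat(20_000);
needsChunking(doc);                      // true  -> splitIntoChunks(doc) path
needsChunking(doc, 'deepseek-reasoner'); // false -> translate in one request
```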

packages/translate/src/index.ts
Lines changed: 7 additions & 1 deletion

```diff
@@ -4,7 +4,7 @@ import { Command } from 'commander';
 import { getConfig } from './config';
 import { logger } from './logger';
 import { main } from './main';
-import type { MainConfig } from './types';
+import type { DeepSeekModel, MainConfig } from './types';
 
 export type Config = MainConfig | MainConfig[];
 
@@ -41,6 +41,10 @@ program
     '--concurrency <number>',
     'Number of concurrent translation tasks (default: 10)',
   )
+  .option(
+    '-m, --model <model>',
+    'DeepSeek model to use: "deepseek-chat" or "deepseek-reasoner" (default: "deepseek-chat")',
+  )
   .action(
     async (options: {
       config?: string;
@@ -51,6 +55,7 @@ program
       targetLanguage?: string;
       max?: number;
       concurrency?: number;
+      model?: DeepSeekModel;
     }) => {
       if (options.verbose) {
         logger.setVerbose(true);
@@ -66,6 +71,7 @@ program
         ...(options.docsPath ? { docsPath: options.docsPath } : {}),
         ...(options.max ? { max: options.max } : {}),
         ...(options.concurrency ? { concurrency: options.concurrency } : {}),
+        ...(options.model ? { model: options.model } : {}),
         verbose: options.verbose,
         listOnly: options.listOnly,
         targetLanguage: options.targetLanguage,
```
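
Note that commander does not validate the value of `-m, --model`; whatever string the user passes is forwarded as `options.model` and merely cast to `DeepSeekModel` by the type annotation. A guard like the following could reject typos early (the `isDeepSeekModel` helper is a hypothetical addition, not part of this commit):

```ts
import type { DeepSeekModel } from './types';

// Hypothetical runtime guard: narrow an arbitrary CLI string to the union type.
function isDeepSeekModel(value: string): value is DeepSeekModel {
  return value === 'deepseek-chat' || value === 'deepseek-reasoner';
}

// Sketch of how it might sit at the top of the .action handler:
// if (options.model && !isDeepSeekModel(options.model)) {
//   logger.error(`Unknown model: ${options.model}`);
//   process.exit(1);
// }
```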

packages/translate/src/main.ts
Lines changed: 2 additions & 0 deletions

```diff
@@ -24,6 +24,7 @@ export async function main({
   targetLanguage,
   concurrency = 10,
   verbose,
+  model = 'deepseek-chat',
 }: MainConfig): Promise<void> {
   // Filter languages based on targetLanguage if specified
   const filteredLangs = targetLanguage
@@ -211,6 +212,7 @@ export async function main({
         targetPath: task.targetPath,
         langConfig,
         docsContext,
+        model,
       });
 
       completedRefDocs++;
```

packages/translate/src/openai.ts
Lines changed: 12 additions & 7 deletions

```diff
@@ -1,12 +1,13 @@
 import OpenAI from 'openai';
 import type { ChatCompletionMessageParam } from 'openai/resources.mjs';
 import {
-  MAX_OUTPUT_TOKENS,
   estimateTokens,
+  getMaxOutputTokens,
   needsChunking,
   splitIntoChunks,
 } from './chunk';
 import { logger } from './logger';
+import type { DeepSeekModel } from './types';
 import { type Usage, addUsage } from './usage';
 
 interface LangConfig {
@@ -18,6 +19,7 @@ interface TranslateDocumentParams {
   content: string;
   langConfig: LangConfig;
   context?: string;
+  model?: DeepSeekModel;
 }
 
 // Initialize OpenAI client if API key is available
@@ -46,6 +48,7 @@ async function translateChunk(
   langConfig: LangConfig,
   context: string,
   needsFrontmatterRules = true,
+  modelName: DeepSeekModel = 'deepseek-chat',
 ): Promise<string> {
   if (!openai) {
     throw new Error('OPENAI_API_KEY is not set.');
@@ -341,9 +344,9 @@ The next message contains the COMPLETE original text that needs to be translated
   // console.log(chunk);
 
   const response = await openai.chat.completions.create({
-    model: model,
-    max_completion_tokens: MAX_OUTPUT_TOKENS,
-    max_tokens: MAX_OUTPUT_TOKENS,
+    model: modelName,
+    max_completion_tokens: getMaxOutputTokens(modelName),
+    max_tokens: getMaxOutputTokens(modelName),
     messages: messages,
   });
 
@@ -363,6 +366,7 @@ export async function $translateDocument({
   content,
   langConfig,
   context = '',
+  model: modelName = 'deepseek-chat',
 }: TranslateDocumentParams): Promise<string> {
   if (!openai) {
     throw new Error('OPENAI_API_KEY is not set.');
@@ -374,14 +378,14 @@ export async function $translateDocument({
   );
 
   // For small documents, use the direct approach
-  if (!needsChunking(content)) {
-    return await translateChunk(content, langConfig, context, true);
+  if (!needsChunking(content, modelName)) {
+    return await translateChunk(content, langConfig, context, true, modelName);
   }
 
   logger.debug(
     'Document is large, splitting into chunks for multi-round translation',
   );
-  const chunks = splitIntoChunks(content);
+  const chunks = splitIntoChunks(content, modelName);
   logger.debug(`Split document into ${chunks.length} chunks`);
 
   let translatedContent = '';
@@ -395,6 +399,7 @@ export async function $translateDocument({
       langConfig,
       context,
       i === 0,
+      modelName,
     );
 
     // Add to the complete translated content
```
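
The create call sets both `max_completion_tokens` (the newer OpenAI parameter name) and the legacy `max_tokens` to the same per-model cap, presumably so the limit holds whichever field DeepSeek's OpenAI-compatible endpoint reads. A sketch of what the request parameters resolve to per model (the `buildCompletionParams` helper is illustrative, not in this commit):

```ts
import type { ChatCompletionMessageParam } from 'openai/resources.mjs';
import { getMaxOutputTokens } from './chunk';
import type { DeepSeekModel } from './types';

// Illustrative helper: gather the per-model request parameters in one place.
function buildCompletionParams(
  modelName: DeepSeekModel,
  messages: ChatCompletionMessageParam[],
) {
  const maxOutput = getMaxOutputTokens(modelName); // 8192 or 65536
  return {
    model: modelName,
    max_completion_tokens: maxOutput,
    max_tokens: maxOutput,
    messages,
  };
}
```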

packages/translate/src/types.ts
Lines changed: 3 additions & 0 deletions

```diff
@@ -1,3 +1,5 @@
+export type DeepSeekModel = 'deepseek-chat' | 'deepseek-reasoner';
+
 export interface LangConfig {
   locale: string;
   name: string;
@@ -30,6 +32,7 @@ export interface MainConfig {
   targetLanguage?: string;
   concurrency?: number;
   verbose?: boolean;
+  model?: DeepSeekModel;
 }
 
 export interface TranslationResult {
```

packages/translate/src/utils.ts
Lines changed: 4 additions & 0 deletions

```diff
@@ -6,6 +6,7 @@ import matter from 'gray-matter';
 import { needsChunking, splitIntoChunks } from './chunk';
 import { logger } from './logger';
 import { $translateDocument } from './openai';
+import type { DeepSeekModel } from './types';
 
 interface LangConfig {
   name: string;
@@ -167,13 +168,15 @@ interface TranslateDocumentFileParams {
   targetPath: string;
   langConfig: LangConfig;
   docsContext?: string;
+  model?: DeepSeekModel;
 }
 
 export async function translateDoc({
   sourcePath,
   targetPath,
   langConfig,
   docsContext,
+  model = 'deepseek-chat',
 }: TranslateDocumentFileParams) {
   // Create directory if it doesn't exist
   logger.debug(`Translating ${sourcePath} to ${targetPath}`);
@@ -192,6 +195,7 @@ export async function translateDoc({
     content: sourceContent,
     langConfig,
     context: translationContext,
+    model,
   });
 
   // Format as ISO strings (UTC)
```

packages/translate/tests/unit/chunk.test.ts
Lines changed: 47 additions & 18 deletions

```diff
@@ -1,16 +1,27 @@
-import { describe, expect, it, vi } from 'vitest';
+import { describe, expect, it } from 'vitest';
 import {
-  CHAR_TO_TOKEN_RATIO,
   CHAR_TO_TOKEN_RATIO_ZH,
-  MAX_CHUNK_SIZE_TOKENS,
-  MAX_INPUT_TOKENS,
-  MAX_OUTPUT_TOKENS,
+  MAX_OUTPUT_TOKENS_CHAT,
+  MAX_OUTPUT_TOKENS_REASONER,
   estimateTokens,
+  getMaxOutputTokens,
   needsChunking,
   splitIntoChunks,
 } from '../../src/chunk';
 
 describe('chunk', () => {
+  describe('getMaxOutputTokens', () => {
+    it('should return correct token limits for deepseek-chat', () => {
+      expect(getMaxOutputTokens('deepseek-chat')).toBe(MAX_OUTPUT_TOKENS_CHAT);
+    });
+
+    it('should return correct token limits for deepseek-reasoner', () => {
+      expect(getMaxOutputTokens('deepseek-reasoner')).toBe(
+        MAX_OUTPUT_TOKENS_REASONER,
+      );
+    });
+  });
+
   describe('estimateTokens', () => {
     it('should estimate tokens based on content length', () => {
       // Create test strings of different lengths
@@ -34,23 +45,41 @@ describe('chunk', () => {
   });
 
   describe('needsChunking', () => {
-    it('should return true for content exceeding MAX_OUTPUT_TOKENS', () => {
-      // Create a string that would exceed the MAX_OUTPUT_TOKENS
-      // MAX_OUTPUT_TOKENS / CHAR_TO_TOKEN_RATIO_ZH gives us the number of characters needed
+    it('should return true for content exceeding MAX_OUTPUT_TOKENS for deepseek-chat', () => {
+      // Create a string that would exceed the MAX_OUTPUT_TOKENS_CHAT
+      // MAX_OUTPUT_TOKENS_CHAT / CHAR_TO_TOKEN_RATIO_ZH gives us the number of characters needed
+      const exceedMaxTokens = 'a'.repeat(
+        Math.ceil(MAX_OUTPUT_TOKENS_CHAT / CHAR_TO_TOKEN_RATIO_ZH) + 1000,
+      );
+
+      expect(needsChunking(exceedMaxTokens, 'deepseek-chat')).toBe(true);
+    });
+
+    it('should return false for content within MAX_OUTPUT_TOKENS for deepseek-chat', () => {
+      // Create a string that would be below the MAX_OUTPUT_TOKENS_CHAT
+      const withinMaxTokens = 'a'.repeat(
+        Math.ceil(MAX_OUTPUT_TOKENS_CHAT / CHAR_TO_TOKEN_RATIO_ZH / 2),
+      );
+
+      expect(needsChunking(withinMaxTokens, 'deepseek-chat')).toBe(false);
+    });
+
+    it('should return true for content exceeding MAX_OUTPUT_TOKENS for deepseek-reasoner', () => {
+      // Create a string that would exceed the MAX_OUTPUT_TOKENS_REASONER
       const exceedMaxTokens = 'a'.repeat(
-        Math.ceil(MAX_OUTPUT_TOKENS / CHAR_TO_TOKEN_RATIO_ZH) + 1000,
+        Math.ceil(MAX_OUTPUT_TOKENS_REASONER / CHAR_TO_TOKEN_RATIO_ZH) + 1000,
       );
 
-      expect(needsChunking(exceedMaxTokens)).toBe(true);
+      expect(needsChunking(exceedMaxTokens, 'deepseek-reasoner')).toBe(true);
     });
 
-    it('should return false for content within MAX_OUTPUT_TOKENS', () => {
-      // Create a string that would be below the MAX_OUTPUT_TOKENS
+    it('should return false for content within MAX_OUTPUT_TOKENS for deepseek-reasoner', () => {
+      // Create a string that would be below the MAX_OUTPUT_TOKENS_REASONER
       const withinMaxTokens = 'a'.repeat(
-        Math.ceil(MAX_OUTPUT_TOKENS / CHAR_TO_TOKEN_RATIO_ZH / 2),
+        Math.ceil(MAX_OUTPUT_TOKENS_REASONER / CHAR_TO_TOKEN_RATIO_ZH / 2),
       );
 
-      expect(needsChunking(withinMaxTokens)).toBe(false);
+      expect(needsChunking(withinMaxTokens, 'deepseek-reasoner')).toBe(false);
     });
   });
 
@@ -72,7 +101,7 @@ Content for section 2.
 
 More content.`;
 
-    const chunks = splitIntoChunks(content);
+    const chunks = splitIntoChunks(content, 'deepseek-chat');
 
     // The current implementation doesn't split by markdown headings as expected
     // so we're testing the actual behavior
@@ -87,10 +116,10 @@ More content.`;
     it('should handle large sections with the current implementation', () => {
       // Create a very large section without headings
       const largeSection = 'a'.repeat(
-        Math.ceil((MAX_OUTPUT_TOKENS / CHAR_TO_TOKEN_RATIO_ZH) * 3),
+        Math.ceil((MAX_OUTPUT_TOKENS_CHAT / CHAR_TO_TOKEN_RATIO_ZH) * 3),
       );
 
-      const chunks = splitIntoChunks(largeSection);
+      const chunks = splitIntoChunks(largeSection, 'deepseek-chat');
 
       // The current implementation returns a single large chunk
       expect(chunks.length).toBeGreaterThanOrEqual(1);
@@ -100,7 +129,7 @@ More content.`;
     });
 
     it('should handle empty content', () => {
-      const chunks = splitIntoChunks('');
+      const chunks = splitIntoChunks('', 'deepseek-chat');
       expect(chunks).toEqual([]);
     });
   });
```
