Commit 0d47e77

feat: add model to translate script

Parent: 4490fd7

7 files changed: +102 additions, −31 deletions

packages/translate/src/chunk.ts
Lines changed: 27 additions & 5 deletions

```diff
@@ -1,8 +1,22 @@
+import type { DeepSeekModel } from './types';
+
 // Constants for token estimation
 export const CHAR_TO_TOKEN_RATIO = 0.3; // 1 English character ≈ 0.3 token
 export const CHAR_TO_TOKEN_RATIO_ZH = 0.5; // 1 Chinese character ≈ 0.5 token
 export const MAX_INPUT_TOKENS = 64 * 1024; // DeepSeek's 64K context length
-export const MAX_OUTPUT_TOKENS = 8 * 1024; // DeepSeek's 8K max output
+
+// Model-specific output token limits
+export const MAX_OUTPUT_TOKENS_CHAT = 8 * 1024; // deepseek-chat: max 8K output
+export const MAX_OUTPUT_TOKENS_REASONER = 64 * 1024; // deepseek-reasoner: max 64K output
+
+// Get max output tokens for a specific model
+export function getMaxOutputTokens(
+  model: DeepSeekModel = 'deepseek-chat',
+): number {
+  return model === 'deepseek-reasoner'
+    ? MAX_OUTPUT_TOKENS_REASONER
+    : MAX_OUTPUT_TOKENS_CHAT;
+}
 
 // Chunk size constants (in estimated tokens)
 export const MAX_CHUNK_SIZE_TOKENS = 16 * 1024; // Use smaller chunks for better translation quality
@@ -13,12 +27,20 @@ export function estimateTokens(content: string): number {
   return Math.ceil(content.length * CHAR_TO_TOKEN_RATIO_ZH);
 }
 
-export function needsChunking(content: string): boolean {
-  return estimateTokens(content) > MAX_OUTPUT_TOKENS;
+export function needsChunking(
+  content: string,
+  model: DeepSeekModel = 'deepseek-chat',
+): boolean {
+  return estimateTokens(content) > getMaxOutputTokens(model);
 }
 
 // Split text into chunks that respect markdown structure and heading hierarchy
-export function splitIntoChunks(content: string): string[] {
+export function splitIntoChunks(
+  content: string,
+  model: DeepSeekModel = 'deepseek-chat',
+): string[] {
+  const maxOutputTokens = getMaxOutputTokens(model);
+
   // Define a regex pattern for markdown headings (## Heading)
   const headingPattern = /^(#{2,}) /gm;
 
@@ -65,7 +87,7 @@ export function splitIntoChunks(content: string): string[] {
   for (const section of sections) {
     const sectionTokens = estimateTokens(section);
 
-    if (currentTokens + sectionTokens > MAX_OUTPUT_TOKENS) {
+    if (currentTokens + sectionTokens > maxOutputTokens) {
       // If adding this section would exceed the limit, start a new chunk
      chunks.push(currentChunk);
      currentChunk = section;
```
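
With CHAR_TO_TOKEN_RATIO_ZH at 0.5, these caps work out to roughly 16K estimated characters before chunking kicks in on deepseek-chat (8 × 1024 ÷ 0.5) versus roughly 128K characters on deepseek-reasoner (64 × 1024 ÷ 0.5). A minimal sketch of the new call shapes, using only the exported helpers above (the sample document is hypothetical):

```ts
import { getMaxOutputTokens, needsChunking, splitIntoChunks } from './chunk';

// Per-model output caps: 8 * 1024 for deepseek-chat, 64 * 1024 for deepseek-reasoner.
getMaxOutputTokens();                    // 8192 (defaults to 'deepseek-chat')
getMaxOutputTokens('deepseek-reasoner'); // 65536

// A hypothetical 20,000-character document estimates to 10,000 tokens
// (20,000 * CHAR_TO_TOKEN_RATIO_ZH), which exceeds the chat cap but not the
// reasoner cap, so the chunking decision now depends on the chosen model.
const doc = '文'.repeat(20_000);
needsChunking(doc);                      // true  -> splitIntoChunks(doc) path
needsChunking(doc, 'deepseek-reasoner'); // false -> translate in one request
```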

packages/translate/src/index.ts
Lines changed: 7 additions & 1 deletion

```diff
@@ -4,7 +4,7 @@ import { Command } from 'commander';
 import { getConfig } from './config';
 import { logger } from './logger';
 import { main } from './main';
-import type { MainConfig } from './types';
+import type { DeepSeekModel, MainConfig } from './types';
 
 export type Config = MainConfig | MainConfig[];
 
@@ -41,6 +41,10 @@ program
     '--concurrency <number>',
     'Number of concurrent translation tasks (default: 10)',
   )
+  .option(
+    '-m, --model <model>',
+    'DeepSeek model to use: "deepseek-chat" or "deepseek-reasoner" (default: "deepseek-chat")',
+  )
   .action(
     async (options: {
       config?: string;
@@ -51,6 +55,7 @@ program
       targetLanguage?: string;
       max?: number;
       concurrency?: number;
+      model?: DeepSeekModel;
     }) => {
       if (options.verbose) {
         logger.setVerbose(true);
@@ -66,6 +71,7 @@ program
         ...(options.docsPath ? { docsPath: options.docsPath } : {}),
         ...(options.max ? { max: options.max } : {}),
         ...(options.concurrency ? { concurrency: options.concurrency } : {}),
+        ...(options.model ? { model: options.model } : {}),
         verbose: options.verbose,
         listOnly: options.listOnly,
         targetLanguage: options.targetLanguage,
```
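
Note that commander does not validate the value of `-m, --model`; whatever string the user passes is forwarded as `options.model` and merely cast to `DeepSeekModel` by the type annotation. A guard like the following could reject typos early (the `isDeepSeekModel` helper is a hypothetical addition, not part of this commit):

```ts
import type { DeepSeekModel } from './types';

// Hypothetical runtime guard: narrow an arbitrary CLI string to the union type.
function isDeepSeekModel(value: string): value is DeepSeekModel {
  return value === 'deepseek-chat' || value === 'deepseek-reasoner';
}

// Sketch of how it might sit at the top of the .action handler:
// if (options.model && !isDeepSeekModel(options.model)) {
//   logger.error(`Unknown model: ${options.model}`);
//   process.exit(1);
// }
```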

packages/translate/src/main.ts
Lines changed: 2 additions & 0 deletions

```diff
@@ -24,6 +24,7 @@ export async function main({
   targetLanguage,
   concurrency = 10,
   verbose,
+  model = 'deepseek-chat',
 }: MainConfig): Promise<void> {
   // Filter languages based on targetLanguage if specified
   const filteredLangs = targetLanguage
@@ -211,6 +212,7 @@ export async function main({
         targetPath: task.targetPath,
         langConfig,
         docsContext,
+        model,
       });
 
       completedRefDocs++;
```

packages/translate/src/openai.ts
Lines changed: 12 additions & 7 deletions

```diff
@@ -1,12 +1,13 @@
 import OpenAI from 'openai';
 import type { ChatCompletionMessageParam } from 'openai/resources.mjs';
 import {
-  MAX_OUTPUT_TOKENS,
   estimateTokens,
+  getMaxOutputTokens,
   needsChunking,
   splitIntoChunks,
 } from './chunk';
 import { logger } from './logger';
+import type { DeepSeekModel } from './types';
 import { type Usage, addUsage } from './usage';
 
 interface LangConfig {
@@ -18,6 +19,7 @@ interface TranslateDocumentParams {
   content: string;
   langConfig: LangConfig;
   context?: string;
+  model?: DeepSeekModel;
 }
 
 // Initialize OpenAI client if API key is available
@@ -46,6 +48,7 @@ async function translateChunk(
   langConfig: LangConfig,
   context: string,
   needsFrontmatterRules = true,
+  modelName: DeepSeekModel = 'deepseek-chat',
 ): Promise<string> {
   if (!openai) {
     throw new Error('OPENAI_API_KEY is not set.');
@@ -341,9 +344,9 @@ The next message contains the COMPLETE original text that needs to be translated
   // console.log(chunk);
 
   const response = await openai.chat.completions.create({
-    model: model,
-    max_completion_tokens: MAX_OUTPUT_TOKENS,
-    max_tokens: MAX_OUTPUT_TOKENS,
+    model: modelName,
+    max_completion_tokens: getMaxOutputTokens(modelName),
+    max_tokens: getMaxOutputTokens(modelName),
     messages: messages,
   });
 
@@ -363,6 +366,7 @@ export async function $translateDocument({
   content,
   langConfig,
   context = '',
+  model: modelName = 'deepseek-chat',
 }: TranslateDocumentParams): Promise<string> {
   if (!openai) {
     throw new Error('OPENAI_API_KEY is not set.');
@@ -374,14 +378,14 @@ export async function $translateDocument({
   );
 
   // For small documents, use the direct approach
-  if (!needsChunking(content)) {
-    return await translateChunk(content, langConfig, context, true);
+  if (!needsChunking(content, modelName)) {
+    return await translateChunk(content, langConfig, context, true, modelName);
   }
 
   logger.debug(
     'Document is large, splitting into chunks for multi-round translation',
   );
-  const chunks = splitIntoChunks(content);
+  const chunks = splitIntoChunks(content, modelName);
   logger.debug(`Split document into ${chunks.length} chunks`);
 
   let translatedContent = '';
@@ -395,6 +399,7 @@ export async function $translateDocument({
       langConfig,
       context,
       i === 0,
+      modelName,
     );
 
     // Add to the complete translated content
```
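
The create call sets both `max_completion_tokens` (the newer OpenAI parameter name) and the legacy `max_tokens` to the same per-model cap, presumably so the limit holds whichever field DeepSeek's OpenAI-compatible endpoint reads. A sketch of what the request parameters resolve to per model (the `buildCompletionParams` helper is illustrative, not in this commit):

```ts
import type { ChatCompletionMessageParam } from 'openai/resources.mjs';
import { getMaxOutputTokens } from './chunk';
import type { DeepSeekModel } from './types';

// Illustrative helper: gather the per-model request parameters in one place.
function buildCompletionParams(
  modelName: DeepSeekModel,
  messages: ChatCompletionMessageParam[],
) {
  const maxOutput = getMaxOutputTokens(modelName); // 8192 or 65536
  return {
    model: modelName,
    max_completion_tokens: maxOutput,
    max_tokens: maxOutput,
    messages,
  };
}
```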

packages/translate/src/types.ts
Lines changed: 3 additions & 0 deletions

```diff
@@ -1,3 +1,5 @@
+export type DeepSeekModel = 'deepseek-chat' | 'deepseek-reasoner';
+
 export interface LangConfig {
   locale: string;
   name: string;
@@ -30,6 +32,7 @@ export interface MainConfig {
   targetLanguage?: string;
   concurrency?: number;
   verbose?: boolean;
+  model?: DeepSeekModel;
 }
 
 export interface TranslationResult {
```

packages/translate/src/utils.ts
Lines changed: 4 additions & 0 deletions

```diff
@@ -6,6 +6,7 @@ import matter from 'gray-matter';
 import { needsChunking, splitIntoChunks } from './chunk';
 import { logger } from './logger';
 import { $translateDocument } from './openai';
+import type { DeepSeekModel } from './types';
 
 interface LangConfig {
   name: string;
@@ -167,13 +168,15 @@ interface TranslateDocumentFileParams {
   targetPath: string;
   langConfig: LangConfig;
   docsContext?: string;
+  model?: DeepSeekModel;
 }
 
 export async function translateDoc({
   sourcePath,
   targetPath,
   langConfig,
   docsContext,
+  model = 'deepseek-chat',
 }: TranslateDocumentFileParams) {
   // Create directory if it doesn't exist
   logger.debug(`Translating ${sourcePath} to ${targetPath}`);
@@ -192,6 +195,7 @@ export async function translateDoc({
     content: sourceContent,
     langConfig,
     context: translationContext,
+    model,
   });
 
   // Format as ISO strings (UTC)
```

packages/translate/tests/unit/chunk.test.ts
Lines changed: 47 additions & 18 deletions

```diff
@@ -1,16 +1,27 @@
-import { describe, expect, it, vi } from 'vitest';
+import { describe, expect, it } from 'vitest';
 import {
-  CHAR_TO_TOKEN_RATIO,
   CHAR_TO_TOKEN_RATIO_ZH,
-  MAX_CHUNK_SIZE_TOKENS,
-  MAX_INPUT_TOKENS,
-  MAX_OUTPUT_TOKENS,
+  MAX_OUTPUT_TOKENS_CHAT,
+  MAX_OUTPUT_TOKENS_REASONER,
   estimateTokens,
+  getMaxOutputTokens,
   needsChunking,
   splitIntoChunks,
 } from '../../src/chunk';
 
 describe('chunk', () => {
+  describe('getMaxOutputTokens', () => {
+    it('should return correct token limits for deepseek-chat', () => {
+      expect(getMaxOutputTokens('deepseek-chat')).toBe(MAX_OUTPUT_TOKENS_CHAT);
+    });
+
+    it('should return correct token limits for deepseek-reasoner', () => {
+      expect(getMaxOutputTokens('deepseek-reasoner')).toBe(
+        MAX_OUTPUT_TOKENS_REASONER,
+      );
+    });
+  });
+
   describe('estimateTokens', () => {
     it('should estimate tokens based on content length', () => {
       // Create test strings of different lengths
@@ -34,23 +45,41 @@ describe('chunk', () => {
   });
 
   describe('needsChunking', () => {
-    it('should return true for content exceeding MAX_OUTPUT_TOKENS', () => {
-      // Create a string that would exceed the MAX_OUTPUT_TOKENS
-      // MAX_OUTPUT_TOKENS / CHAR_TO_TOKEN_RATIO_ZH gives us the number of characters needed
+    it('should return true for content exceeding MAX_OUTPUT_TOKENS for deepseek-chat', () => {
+      // Create a string that would exceed the MAX_OUTPUT_TOKENS_CHAT
+      // MAX_OUTPUT_TOKENS_CHAT / CHAR_TO_TOKEN_RATIO_ZH gives us the number of characters needed
+      const exceedMaxTokens = 'a'.repeat(
+        Math.ceil(MAX_OUTPUT_TOKENS_CHAT / CHAR_TO_TOKEN_RATIO_ZH) + 1000,
+      );
+
+      expect(needsChunking(exceedMaxTokens, 'deepseek-chat')).toBe(true);
+    });
+
+    it('should return false for content within MAX_OUTPUT_TOKENS for deepseek-chat', () => {
+      // Create a string that would be below the MAX_OUTPUT_TOKENS_CHAT
+      const withinMaxTokens = 'a'.repeat(
+        Math.ceil(MAX_OUTPUT_TOKENS_CHAT / CHAR_TO_TOKEN_RATIO_ZH / 2),
+      );
+
+      expect(needsChunking(withinMaxTokens, 'deepseek-chat')).toBe(false);
+    });
+
+    it('should return true for content exceeding MAX_OUTPUT_TOKENS for deepseek-reasoner', () => {
+      // Create a string that would exceed the MAX_OUTPUT_TOKENS_REASONER
       const exceedMaxTokens = 'a'.repeat(
-        Math.ceil(MAX_OUTPUT_TOKENS / CHAR_TO_TOKEN_RATIO_ZH) + 1000,
+        Math.ceil(MAX_OUTPUT_TOKENS_REASONER / CHAR_TO_TOKEN_RATIO_ZH) + 1000,
       );
 
-      expect(needsChunking(exceedMaxTokens)).toBe(true);
+      expect(needsChunking(exceedMaxTokens, 'deepseek-reasoner')).toBe(true);
     });
 
-    it('should return false for content within MAX_OUTPUT_TOKENS', () => {
-      // Create a string that would be below the MAX_OUTPUT_TOKENS
+    it('should return false for content within MAX_OUTPUT_TOKENS for deepseek-reasoner', () => {
+      // Create a string that would be below the MAX_OUTPUT_TOKENS_REASONER
       const withinMaxTokens = 'a'.repeat(
-        Math.ceil(MAX_OUTPUT_TOKENS / CHAR_TO_TOKEN_RATIO_ZH / 2),
+        Math.ceil(MAX_OUTPUT_TOKENS_REASONER / CHAR_TO_TOKEN_RATIO_ZH / 2),
       );
 
-      expect(needsChunking(withinMaxTokens)).toBe(false);
+      expect(needsChunking(withinMaxTokens, 'deepseek-reasoner')).toBe(false);
     });
   });
 
@@ -72,7 +101,7 @@ Content for section 2.
 
 More content.`;
 
-    const chunks = splitIntoChunks(content);
+    const chunks = splitIntoChunks(content, 'deepseek-chat');
 
     // The current implementation doesn't split by markdown headings as expected
     // so we're testing the actual behavior
@@ -87,10 +116,10 @@ More content.`;
     it('should handle large sections with the current implementation', () => {
       // Create a very large section without headings
       const largeSection = 'a'.repeat(
-        Math.ceil((MAX_OUTPUT_TOKENS / CHAR_TO_TOKEN_RATIO_ZH) * 3),
+        Math.ceil((MAX_OUTPUT_TOKENS_CHAT / CHAR_TO_TOKEN_RATIO_ZH) * 3),
       );
 
-      const chunks = splitIntoChunks(largeSection);
+      const chunks = splitIntoChunks(largeSection, 'deepseek-chat');
 
       // The current implementation returns a single large chunk
       expect(chunks.length).toBeGreaterThanOrEqual(1);
@@ -100,7 +129,7 @@ More content.`;
     });
 
     it('should handle empty content', () => {
-      const chunks = splitIntoChunks('');
+      const chunks = splitIntoChunks('', 'deepseek-chat');
       expect(chunks).toEqual([]);
     });
   });
```
