Skip to content

Commit 7261137

Browse files
committed
add custom corpora
1 parent 2485f9e commit 7261137

File tree

4 files changed

+275
-26
lines changed

4 files changed

+275
-26
lines changed

Examples/ServiceExamples/Scripts/ExampleSpeechToText.cs

Lines changed: 50 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,16 @@ public class ExampleSpeechToText : MonoBehaviour
2727
private SpeechToText m_SpeechToText = new SpeechToText();
2828

2929
private string m_CreatedCustomizationID;
30+
private string m_CreatedCorpusName = "unity-corpus";
31+
private string m_CustomCorpusFilePath;
3032

3133
void Start()
3234
{
3335
//m_SpeechToText.Recognize(m_AudioClip, HandleOnRecognize);
3436
LogSystem.InstallDefaultReactors();
3537

38+
m_CustomCorpusFilePath = Application.dataPath + "/Watson/Examples/ServiceExamples/TestData/test-stt-corpus.txt";
39+
3640
// test GetModels and GetModel
3741
//TestGetModels();
3842

@@ -103,13 +107,19 @@ private void TestResetCustomization(string customizationID)
103107
private void TestGetCustomCorpora(string customizationID)
104108
{
105109
Log.Debug("ExampleSpeechToText", "Attempting to get custom corpora for {0}", customizationID);
106-
m_SpeechToText.GetCustomCorpora(HandleGetCustopmCorpora, customizationID);
110+
m_SpeechToText.GetCustomCorpora(HandleGetCustomCorpora, customizationID);
107111
}
108112

109-
private void TestDeleteCustomCorpora(string customizationID, string corpusName)
113+
private void TestDeleteCustomCorpus(string customizationID, string corpusName)
110114
{
111-
Log.Debug("ExampleSpeechToText", "Attempting to delete custom corpora {1} in customization {0}", customizationID, corpusName);
112-
m_SpeechToText.DeleteCustomCorpora(HandleDeleteCustomCorpora, customizationID, corpusName);
115+
Log.Debug("ExampleSpeechToText", "Attempting to delete custom corpus {1} in customization {0}", customizationID, corpusName);
116+
m_SpeechToText.DeleteCustomCorpus(HandleDeleteCustomCorpus, customizationID, corpusName);
117+
}
118+
119+
private void TestAddCustomCorpus(string customizationID, string corpusName, bool allowOverwrite, string trainingDataPath)
120+
{
121+
Log.Debug("ExampleSpeechToText", "Attempting to add custom corpus {1} in customization {0}", customizationID, corpusName);
122+
m_SpeechToText.AddCustomCorpus(HandleAddCustomCorpus, customizationID, corpusName, allowOverwrite, trainingDataPath);
113123
}
114124

115125
private void HandleGetModels(Model[] models)
@@ -232,8 +242,9 @@ private void HandleGetCustomization(Customization customization, string customDa
232242
Log.Debug("ExampleSpeechToText", "Customization - name: {0} | description: {1} | status: {2}", customization.name, customization.description, customization.status);
233243
Log.Debug("ExampleSpeechToText", "GetCustomization() succeeded!");
234244

235-
// test get custom corpora
236-
TestGetCustomCorpora(m_CreatedCustomizationID);
245+
// test add custom corpora
246+
//TestAddCustomCorpus(m_CreatedCustomizationID, m_CreatedCorpusName, true, m_CustomCorpusFilePath);
247+
TestDeleteCustomization(m_CreatedCustomizationID);
237248
}
238249
else
239250
{
@@ -250,6 +261,8 @@ private void HandleTrainCustomization(bool success, string customData)
250261
{
251262
Log.Debug("ExampleSpeechToText", "Train customization {0}!", m_CreatedCustomizationID);
252263
Log.Debug("ExampleSpeechToText", "TrainCustomization() succeeded!");
264+
265+
TestResetCustomization(m_CreatedCustomizationID);
253266
}
254267
else
255268
{
@@ -266,6 +279,7 @@ private void HandleUpgradeCustomization(bool success, string customData)
266279
{
267280
Log.Debug("ExampleSpeechToText", "Upgrade customization {0}!", m_CreatedCustomizationID);
268281
Log.Debug("ExampleSpeechToText", "UpgradeCustomization() succeeded!");
282+
269283
}
270284
else
271285
{
@@ -282,14 +296,17 @@ private void HandleResetCustomization(bool success, string customData)
282296
{
283297
Log.Debug("ExampleSpeechToText", "Reset customization {0}!", m_CreatedCustomizationID);
284298
Log.Debug("ExampleSpeechToText", "ResetCustomization() succeeded!");
299+
300+
// test delete custom corpus
301+
TestDeleteCustomCorpus(m_CreatedCustomizationID, m_CreatedCorpusName);
285302
}
286303
else
287304
{
288305
Log.Debug("ExampleSpeechToText", "Failed to reset customization!");
289306
}
290307
}
291308

292-
private void HandleGetCustopmCorpora(Corpora corpora, string customData)
309+
private void HandleGetCustomCorpora(Corpora corpora, string customData)
293310
{
294311
if (!string.IsNullOrEmpty(customData))
295312
Log.Debug("ExampleSpeechToText", "CustomData: {0}", customData);
@@ -301,6 +318,9 @@ private void HandleGetCustopmCorpora(Corpora corpora, string customData)
301318
foreach (Corpus corpus in corpora.corpora)
302319
Log.Debug("ExampleSpeechToText", "Corpus - name: {0} | total_words: {1} | out_of_vocabulary_words: {2} | status: {3}",
303320
corpus.name, corpus.total_words, corpus.out_of_vocabulary_words, corpus.status);
321+
322+
//TestUpgradeCustomization(m_CreatedCustomizationID);
323+
TestTrainCustomization(m_CreatedCustomizationID);
304324
}
305325
else
306326
{
@@ -316,18 +336,37 @@ private void HandleGetCustopmCorpora(Corpora corpora, string customData)
316336
}
317337
}
318338

319-
private void HandleDeleteCustomCorpora(bool success, string customData)
339+
private void HandleDeleteCustomCorpus(bool success, string customData)
340+
{
341+
if (!string.IsNullOrEmpty(customData))
342+
Log.Debug("ExampleSpeechToText", "custom data: {0}", customData);
343+
344+
if (success)
345+
{
346+
Log.Debug("ExampleSpeechToText", "DeleteCustomCorpus() succeeded!");
347+
348+
}
349+
else
350+
{
351+
Log.Debug("ExampleSpeechToText", "Failed to delete custom corpus!");
352+
}
353+
}
354+
355+
private void HandleAddCustomCorpus(bool success, string customData)
320356
{
321357
if (!string.IsNullOrEmpty(customData))
322358
Log.Debug("ExampleSpeechToText", "custom data: {0}", customData);
323359

324360
if (success)
325361
{
326-
Log.Debug("ExampleSpeechToText", "DeleteCustomCorpora() succeeded!");
362+
Log.Debug("ExampleSpeechToText", "AddCustomCorpus() succeeded!");
363+
364+
// test get custom corpora
365+
TestGetCustomCorpora(m_CreatedCustomizationID);
327366
}
328367
else
329368
{
330-
Log.Debug("ExampleSpeechToText", "Failed to delete custom corpora!");
369+
Log.Debug("ExampleSpeechToText", "Failed to add custom corpus!");
331370
}
332371
}
333-
}
372+
}
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
Adds a single corpus text file of new training data to the custom language model.
2+
Use multiple requests to submit multiple corpus text files.
3+
Only the owner of a custom model can use this method to add a corpus to the model.
4+
5+
Submit a plain text file that contains sample sentences from the domain of interest to enable the service to extract words in context.
6+
The more sentences you add that represent the context in which speakers use words from the domain, the better the service's recognition accuracy.
7+
Adding a corpus does not affect the custom model until you train the model for the new data by using the Train a custom model method.
8+
9+
Use the following guidelines to prepare a corpus text file:
10+
11+
Provide a plain text file that is encoded in UTF-8 if it contains non-ASCII characters.
12+
The service assumes UTF-8 encoding if it encounters such characters.
13+
14+
Include each sentence of the corpus on its own line, terminating each line with a carriage return.
15+
Including multiple sentences on the same line can degrade accuracy.
16+
17+
Use consistent capitalization for words in the corpus.
18+
The words resource is case-sensitive; mix upper- and lowercase letters and use capitalization only when intended.
19+
20+
Beware of typographical errors.
21+
The service assumes that typos are new words; unless you correct them before training the model, the service adds them to the model's vocabulary.
22+
23+
The service automatically does the following:
24+
25+
Converts numbers to their equivalent words.
26+
For example, 500 becomes five hundred, and 0.15 becomes zero point fifteen.
27+
28+
Removes the following punctuation and special characters:
29+
30+
! @ # $ % ^ & * - + = ~ _ . , ; : ( ) < > [ ] { }
31+
32+
Ignores phrases enclosed in ( ) (parentheses), < > (angle brackets), [ ] (square brackets), and { } (curly braces).
33+
34+
Converts tokens that include certain symbols to meaningful strings.
35+
For example, the service:
36+
37+
Converts a $ (dollar sign) followed by a number to its string representation.
38+
For example, $100 becomes one hundred dollars.
39+
40+
Converts a % (percent sign) preceded by a number to its string representation.
41+
For example, 100% becomes one hundred percent.
42+
43+
This list is not exhaustive; the service makes similar adjustments for other characters as needed.
44+
45+
The call returns an HTTP 201 response code if the corpus is valid. It then asynchronously pre-processes the contents of the corpus and automatically extracts new words that it finds.
46+
This can take on the order of a minute or two to complete depending on the total number of words and the number of new words in the corpus, as well as the current load on the service.
47+
You cannot submit requests to add additional corpora or words to the custom model, or to train the model, until the service's analysis of the corpus for the current request completes.
48+
Use the List corpora method to check the status of the analysis.
49+
50+
The service auto-populates the model's words resource with any word that is not found in its base vocabulary; these are referred to as out-of-vocabulary (OOV) words.
51+
You can use the List custom words method to examine the words resource, and use the other words methods to eliminate typos and modify how words are pronounced as needed.
52+
53+
To add a corpus file that has the same name as an existing corpus, set the allow_overwrite query parameter to true; otherwise, the request fails.
54+
Overwriting an existing corpus causes the service to process the corpus text file and extract OOV words anew.
55+
Before doing so, it removes any OOV words associated with the existing corpus from the model's words resource unless they were also added by another corpus or they have been modified in some way with the Add custom words or Add a custom word method.
56+
57+
The service limits the overall amount of data that you can add to a custom model to a maximum of 10 million total words from all corpora combined.
58+
Also, you can add no more than 30 thousand new words to a model; this includes words that the service extracts from corpora and words that you add directly.

Examples/ServiceExamples/TestData/test-stt-corpus.txt.meta

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)