Commit af05f0d

v-shobhit, mlcommons-bot, and mrmhodak authored
Parallelize evaluation of llama2 rouge scores (#1995)
* Parallelize evaluation of rouge scores
* use variable for repeated val
* [Automated Commit] Format Codebase

---------

Co-authored-by: mlcommons-bot <[email protected]>
Co-authored-by: Miro <[email protected]>
1 parent 88f4d23 commit af05f0d
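
The change follows a chunk-and-merge pattern: split the predictions and references into one chunk per CPU core, score each chunk in a multiprocessing Pool, concatenate the per-sample scores, and average at the end. Below is a minimal sketch of that pattern, assuming the `evaluate` and `rouge_score` packages are installed; the `parallel_rouge` wrapper name is only illustrative and not part of the commit.

# Minimal sketch of the chunk-and-merge pattern used by this commit.
# Assumes the `evaluate` and `rouge_score` packages are installed;
# `parallel_rouge` is an illustrative wrapper name, not part of the commit.
from multiprocessing import Pool, cpu_count

import evaluate
import numpy as np


def compute_rouge_chunk(chunk):
    """Score one (predictions, references) chunk, one score per sample."""
    metric = evaluate.load("rouge")
    preds, targets = chunk
    return metric.compute(
        predictions=preds, references=targets, use_stemmer=True, use_aggregator=False
    )


def parallel_rouge(preds, targets):
    num_chunks = cpu_count()
    # Ceiling division so a trailing partial chunk is not dropped.
    chunk_size = len(preds) // num_chunks + (len(preds) % num_chunks > 0)
    chunks = [
        (preds[i:i + chunk_size], targets[i:i + chunk_size])
        for i in range(0, len(preds), chunk_size)
    ]
    with Pool(num_chunks) as pool:
        results_list = pool.map(compute_rouge_chunk, chunks)
    # Concatenate per-sample scores from every chunk, then average.
    merged = {}
    for result in results_list:
        for k, v in result.items():
            merged.setdefault(k, []).extend(v)
    return {k: round(np.mean(v) * 100, 4) for k, v in merged.items()}


if __name__ == "__main__":  # guard needed for multiprocessing on spawn platforms
    print(parallel_rouge(["a cat sat"], ["the cat sat"]))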

File tree

1 file changed, +39 -9 lines changed


language/llama2-70b/evaluate-accuracy.py

Lines changed: 39 additions & 9 deletions
@@ -4,6 +4,7 @@
 import evaluate
 import numpy as np
 import json
+from multiprocessing import Pool, cpu_count


 def get_args():
@@ -52,12 +53,21 @@ def postprocess_text(preds, targets):
     return preds, targets


+def compute_rouge_chunk(chunk):
+    """Compute ROUGE scores for a chunk of predictions and references."""
+    metric = evaluate.load("rouge")
+    preds, targets = chunk
+    result = metric.compute(
+        predictions=preds, references=targets, use_stemmer=True, use_aggregator=False
+    )
+    return result
+
+
 def main():

     args = get_args()
     dataset_path = args.dataset_file
     checkpoint_path = args.checkpoint_path
-    metric = evaluate.load("rouge")
     nltk.download("punkt")
     nltk.download("punkt_tab")

@@ -103,23 +113,43 @@ def main():

     preds, targets = postprocess_text(preds_decoded_text, target_required)

-    result = metric.compute(
-        predictions=preds, references=targets, use_stemmer=True, use_aggregator=False
-    )
-    result = {k: round(np.mean(v) * 100, 4) for k, v in result.items()}
+    # Split data into chunks for parallel processing
+    num_chunks = cpu_count()  # Number of parallel processes
+    chunk_size = len(preds) // num_chunks + (len(preds) % num_chunks > 0)
+
+    chunks = [
+        (preds[i:i + chunk_size], targets[i:i + chunk_size])
+        for i in range(0, len(preds), chunk_size)
+    ]
+
+    # Use multiprocessing Pool to compute ROUGE scores in parallel
+    with Pool(num_chunks) as pool:
+        results_list = pool.map(compute_rouge_chunk, chunks)
+
+    # Aggregate results from all chunks
+    aggregated_results = {}
+
+    for result in results_list:
+        for k, v in result.items():
+            if k not in aggregated_results:
+                aggregated_results[k] = []
+            aggregated_results[k].extend(v)
+
+    final_result = {k: round(np.mean(v) * 100, 4)
+                    for k, v in aggregated_results.items()}
+
     prediction_lens = [len(pred) for pred in preds]
     gen_num = len(preds)

-    result = {
-        **result,
+    final_result.update({
         "gen_len": np.sum(prediction_lens),
         "gen_num": gen_num,
         "gen_tok_len": gen_tok_len,
         "tokens_per_sample": round(gen_tok_len / gen_num, 1),
-    }
+    })

     print("\nResults\n")
-    print(result)
+    print(final_result)


 if __name__ == "__main__":
