File tree — 2 files changed: +29 −2 lines changed
lines changed Original file line number Diff line number Diff line change @@ -1379,7 +1379,13 @@ int main(int argc, char **argv)
1379
1379
}
1380
1380
}
1381
1381
1382
- const json data = format_final_response (llama, llama.generated_text , llama.generated_token_probs );
1382
+ auto probs = llama.generated_token_probs ;
1383
+ if (llama.params .n_probs > 0 && llama.stopped_word ) {
1384
+ const std::vector<llama_token> stop_word_toks = llama_tokenize (llama.ctx , llama.stopping_word , false );
1385
+ probs = std::vector<completion_token_output>(llama.generated_token_probs .begin (), llama.generated_token_probs .end () - stop_word_toks.size ());
1386
+ }
1387
+
1388
+ const json data = format_final_response (llama, llama.generated_text , probs);
1383
1389
1384
1390
llama_print_timings (llama.ctx );
1385
1391
@@ -1456,7 +1462,11 @@ int main(int argc, char **argv)
1456
1462
1457
1463
if (!llama.has_next_token ) {
1458
1464
// Generation is done, send extra information.
1459
- const json data = format_final_response (llama, " " , llama.generated_token_probs );
1465
+ const json data = format_final_response (
1466
+ llama,
1467
+ " " ,
1468
+ std::vector<completion_token_output>(llama.generated_token_probs .begin (), llama.generated_token_probs .begin () + sent_token_probs_index)
1469
+ );
1460
1470
1461
1471
const std::string str =
1462
1472
" data: " +
Original file line number Diff line number Diff line change 81
81
#if defined(GGML_USE_HIPBLAS)
82
82
#define __CUDA_ARCH__ 1300
83
83
84
+ #ifndef __has_builtin
85
+ #define __has_builtin (x ) 0
86
+ #endif
87
+
84
88
typedef int8_t int8x4_t __attribute__ ((ext_vector_type(4 )));
85
89
// Per-byte saturating subtract of two packed int8x4 values (HIP compatibility
// shim for the CUDA __vsubss4 intrinsic). Each of the four lanes computes
// a.lane - b.lane clamped to the int8_t range [-128, 127].
// Uses the clang builtin when available; otherwise falls back to a scalar
// clamp loop over the four lanes.
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
    const int8x4_t va = reinterpret_cast<const int8x4_t &>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t &>(b);
#if __has_builtin(__builtin_elementwise_sub_sat)
    // Single-instruction path: clang lowers this to a vector saturating sub.
    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
    return reinterpret_cast<const int &>(c);
#else
    int8x4_t c;
#pragma unroll
    for (int i = 0; i < 4; i++) {
        // Widen to 16 bits so the raw difference cannot overflow, then clamp.
        int16_t diff = va[i] - vb[i];
        const int16_t lo = std::numeric_limits<int8_t>::min();
        const int16_t hi = std::numeric_limits<int8_t>::max();
        if (diff < lo) {
            diff = lo;
        } else if (diff > hi) {
            diff = hi;
        }
        c[i] = static_cast<int8_t>(diff);
    }
    return reinterpret_cast<int &>(c);
#endif // __has_builtin(__builtin_elementwise_sub_sat)
}
91
108
92
109
static __device__ __forceinline__ int __dp4a (const int a, const int b, int c) {
You can't perform that action at this time.
0 commit comments