@@ -64,27 +64,59 @@ __mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __
64
64
constexpr size_t __unroll_count = 4 ;
65
65
constexpr size_t __vec_size = __native_vector_size<_Tp>;
66
66
using __vec = __simd_vector<_Tp, __vec_size>;
67
- while (!__libcpp_is_constant_evaluated () && static_cast <size_t >(__last1 - __first1) >= __unroll_count * __vec_size)
68
- [[__unlikely__]] {
69
- __vec __lhs[__unroll_count];
70
- __vec __rhs[__unroll_count];
71
-
72
- for (size_t __i = 0 ; __i != __unroll_count; ++__i) {
73
- __lhs[__i] = std::__load_vector<__vec>(__first1 + __i * __vec_size);
74
- __rhs[__i] = std::__load_vector<__vec>(__first2 + __i * __vec_size);
67
+
68
+ if (!__libcpp_is_constant_evaluated ()) {
69
+ auto __orig_first1 = __first1;
70
+ auto __last2 = __first2 + (__last1 - __first1);
71
+ while (static_cast <size_t >(__last1 - __first1) >= __unroll_count * __vec_size) [[__unlikely__]] {
72
+ __vec __lhs[__unroll_count];
73
+ __vec __rhs[__unroll_count];
74
+
75
+ for (size_t __i = 0 ; __i != __unroll_count; ++__i) {
76
+ __lhs[__i] = std::__load_vector<__vec>(__first1 + __i * __vec_size);
77
+ __rhs[__i] = std::__load_vector<__vec>(__first2 + __i * __vec_size);
78
+ }
79
+
80
+ for (size_t __i = 0 ; __i != __unroll_count; ++__i) {
81
+ if (auto __cmp_res = __lhs[__i] == __rhs[__i]; !std::__all_of (__cmp_res)) {
82
+ auto __offset = __i * __vec_size + std::__find_first_not_set (__cmp_res);
83
+ return {__first1 + __offset, __first2 + __offset};
84
+ }
85
+ }
86
+
87
+ __first1 += __unroll_count * __vec_size;
88
+ __first2 += __unroll_count * __vec_size;
75
89
}
76
90
77
- for (size_t __i = 0 ; __i != __unroll_count; ++__i) {
78
- if (auto __cmp_res = __lhs[__i] == __rhs[__i]; !std::__all_of (__cmp_res)) {
79
- auto __offset = __i * __vec_size + std::__find_first_not_set (__cmp_res);
91
+ // check the remaining 0-3 vectors
92
+ while (static_cast <size_t >(__last1 - __first1) >= __vec_size) {
93
+ if (auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
94
+ !std::__all_of (__cmp_res)) {
95
+ auto __offset = std::__find_first_not_set (__cmp_res);
80
96
return {__first1 + __offset, __first2 + __offset};
81
97
}
98
+ __first1 += __vec_size;
99
+ __first2 += __vec_size;
82
100
}
83
101
84
- __first1 += __unroll_count * __vec_size;
85
- __first2 += __unroll_count * __vec_size;
102
+ if (__last1 - __first1 == 0 )
103
+ return {__first1, __first2};
104
+
105
+ // Check if we can load elements in front of the current pointer. If that's the case, load a vector at
106
+ // (last - vector_size) to check the remaining elements
107
+ if (static_cast <size_t >(__first1 - __orig_first1) >= __vec_size) {
108
+ __first1 = __last1 - __vec_size;
109
+ __first2 = __last2 - __vec_size;
110
+ auto __offset =
111
+ std::__find_first_not_set (std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2));
112
+ return {__first1 + __offset, __first2 + __offset};
113
+ } // else loop over the elements individually
114
+
115
+ // TODO: Consider vectorizing the loop tail further with
116
+ // - smaller vectors
117
+ // - loading bytes out of range if it's known to be safe
86
118
}
87
- // TODO: Consider vectorizing the tail
119
+
88
120
return std::__mismatch_loop (__first1, __last1, __first2, __pred, __proj1, __proj2);
89
121
}
90
122
0 commit comments