@@ -574,9 +574,18 @@ void submit_sliding_window1d(const PaddedSpan<const T, SizeT> &a,
574
574
}
575
575
576
576
auto *const out_ptr = out.begin ();
577
- auto *const out_end = out.end ();
578
- results.store (&out_ptr[glid],
579
- [out_end](auto &&ptr) { return ptr < out_end; });
577
+ // auto *const out_end = out.end();
578
+
579
+ auto y_start = glid;
580
+ auto y_stop = std::min (y_start + WorkPI*results.size_x (), out.size ());
581
+ int32_t i = 0 ;
582
+ for (uint32_t y = y_start; y < y_stop; y+=results.size_x ())
583
+ {
584
+ out_ptr[y] = results[i++];
585
+ }
586
+ // results.store() below is disabled: due to excessive optimizations it results in memory corruption
587
+ // results.store(&out_ptr[glid],
588
+ // [out_end](auto &&ptr) { return ptr < out_end; });
580
589
});
581
590
}
582
591
@@ -635,9 +644,18 @@ void submit_sliding_window1d_small_kernel(const PaddedSpan<const T, SizeT> &a,
635
644
red);
636
645
637
646
auto *const out_ptr = out.begin ();
638
- auto *const out_end = out.end ();
639
- results.store (&out_ptr[glid],
640
- [out_end](auto &&ptr) { return ptr < out_end; });
647
+ // auto *const out_end = out.end();
648
+
649
+ auto y_start = glid;
650
+ auto y_stop = std::min (y_start + WorkPI*results.size_x (), out.size ());
651
+ int32_t i = 0 ;
652
+ for (uint32_t y = y_start; y < y_stop; y+=results.size_x ())
653
+ {
654
+ out_ptr[y] = results[i++];
655
+ }
656
+ // results.store() below is disabled: due to excessive optimizations it results in memory corruption
657
+ // results.store(&out_ptr[glid],
658
+ // [out_end](auto &&ptr) { return ptr < out_end; });
641
659
});
642
660
}
643
661
0 commit comments