@@ -474,6 +474,7 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
474
474
int ne0, int ne1, int ne2, int ne3,
475
475
int ne10, int ne11, int ne12, int ne13,
476
476
/* int s0, */ int s1, int s2, int s3,
477
+ /* int s00,*/ int s01, int s02, int s03,
477
478
/* int s10,*/ int s11, int s12, int s13,
478
479
const sycl::nd_item<3 > &item_ct1) {
479
480
const int i0s = item_ct1.get_local_range (2 ) * item_ct1.get_group (2 ) +
@@ -495,9 +496,9 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
495
496
const int i12 = i2 % ne12;
496
497
const int i13 = i3 % ne13;
497
498
498
- const size_t i_src0 = i3*s3 + i2*s2 + i1*s1 ;
499
+ const size_t i_src0 = i3*s03 + i2*s02 + i1*s01 ;
499
500
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
500
- const size_t i_dst = i_src0 ;
501
+ const size_t i_dst = i3*s3 + i2*s2 + i1*s1 ;
501
502
502
503
const src0_t * src0_row = src0 + i_src0;
503
504
const src1_t * src1_row = src1 + i_src1;
@@ -515,6 +516,7 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
515
516
int ne0, int ne1, int ne2, int ne3,
516
517
int ne10, int ne11, int ne12, int ne13,
517
518
/* int s0, */ int s1, int s2, int s3,
519
+ /* int s00,*/ int s01, int s02, int s03,
518
520
/* int s10,*/ int s11, int s12, int s13,
519
521
const sycl::nd_item<3 > &item_ct1) {
520
522
@@ -534,9 +536,9 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
534
536
const int i12 = i2 % ne12;
535
537
const int i13 = i3 % ne13;
536
538
537
- const size_t i_src0 = i3*s3 + i2*s2 + i1*s1 ;
539
+ const size_t i_src0 = i3*s03 + i2*s02 + i1*s01 ;
538
540
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
539
- const size_t i_dst = i_src0 ;
541
+ const size_t i_dst = i3*s3 + i2*s2 + i1*s1 ;
540
542
541
543
const src0_t * src0_row = src0 + i_src0;
542
544
const src1_t * src1_row = src1 + i_src1;
@@ -566,9 +568,11 @@ struct bin_bcast_sycl {
566
568
int nr[4 ] = { nr0, nr1, nr2, nr3 };
567
569
568
570
// collapse dimensions until first broadcast dimension
569
- int64_t cne0[] = {ne0, ne1, ne2, ne3};
571
+ int64_t cne[] = {ne0, ne1, ne2, ne3};
572
+ int64_t cne0[] = {ne00, ne01, ne02, ne03};
570
573
int64_t cne1[] = {ne10, ne11, ne12, ne13};
571
- size_t cnb0[] = {nb0, nb1, nb2, nb3};
574
+ size_t cnb[] = {nb0, nb1, nb2, nb3};
575
+ size_t cnb0[] = {nb00, nb01, nb02, nb03};
572
576
size_t cnb1[] = {nb10, nb11, nb12, nb13};
573
577
auto collapse = [](int64_t cne[]) {
574
578
cne[0 ] *= cne[1 ];
@@ -583,32 +587,41 @@ struct bin_bcast_sycl {
583
587
cnb[3 ] *= cne[3 ];
584
588
};
585
589
586
- for (int i = 0 ; i < 4 ; i++) {
587
- if (nr[i] != 1 ) {
588
- break ;
589
- }
590
- if (i > 0 ) {
591
- collapse_nb (cnb0, cne0);
592
- collapse_nb (cnb1, cne1);
593
- collapse (cne0);
594
- collapse (cne1);
590
+ if (ggml_is_contiguous (src0) && ggml_is_contiguous (src1) && ggml_is_contiguous (dst)) {
591
+ for (int i = 0 ; i < 4 ; i++) {
592
+ if (nr[i] != 1 ) {
593
+ break ;
594
+ }
595
+ if (i > 0 ) {
596
+ collapse_nb (cnb, cne);
597
+ collapse_nb (cnb0, cne0);
598
+ collapse_nb (cnb1, cne1);
599
+ collapse (cne);
600
+ collapse (cne0);
601
+ collapse (cne1);
602
+ }
595
603
}
596
604
}
597
605
{
598
- int64_t ne0 = cne0 [0 ];
599
- int64_t ne1 = cne0 [1 ];
600
- int64_t ne2 = cne0 [2 ];
601
- int64_t ne3 = cne0 [3 ];
606
+ int64_t ne0 = cne [0 ];
607
+ int64_t ne1 = cne [1 ];
608
+ int64_t ne2 = cne [2 ];
609
+ int64_t ne3 = cne [3 ];
602
610
603
611
int64_t ne10 = cne1[0 ];
604
612
int64_t ne11 = cne1[1 ];
605
613
int64_t ne12 = cne1[2 ];
606
614
int64_t ne13 = cne1[3 ];
607
615
608
- size_t nb0 = cnb0[0 ];
609
- size_t nb1 = cnb0[1 ];
610
- size_t nb2 = cnb0[2 ];
611
- size_t nb3 = cnb0[3 ];
616
+ size_t nb0 = cnb[0 ];
617
+ size_t nb1 = cnb[1 ];
618
+ size_t nb2 = cnb[2 ];
619
+ size_t nb3 = cnb[3 ];
620
+
621
+ size_t nb00 = cnb0[0 ];
622
+ size_t nb01 = cnb0[1 ];
623
+ size_t nb02 = cnb0[2 ];
624
+ size_t nb03 = cnb0[3 ];
612
625
613
626
size_t nb10 = cnb1[0 ];
614
627
size_t nb11 = cnb1[1 ];
@@ -625,6 +638,28 @@ struct bin_bcast_sycl {
625
638
size_t s12 = nb12 / sizeof (src1_t );
626
639
size_t s13 = nb13 / sizeof (src1_t );
627
640
641
+ size_t s00 = nb00 / sizeof (src0_t );
642
+ size_t s01 = nb01 / sizeof (src0_t );
643
+ size_t s02 = nb02 / sizeof (src0_t );
644
+ size_t s03 = nb03 / sizeof (src0_t );
645
+
646
+ GGML_UNUSED (s00);
647
+
648
+ GGML_ASSERT (nb0 % sizeof (dst_t ) == 0 );
649
+ GGML_ASSERT (nb1 % sizeof (dst_t ) == 0 );
650
+ GGML_ASSERT (nb2 % sizeof (dst_t ) == 0 );
651
+ GGML_ASSERT (nb3 % sizeof (dst_t ) == 0 );
652
+
653
+ GGML_ASSERT (nb00 % sizeof (src0_t ) == 0 );
654
+ GGML_ASSERT (nb01 % sizeof (src0_t ) == 0 );
655
+ GGML_ASSERT (nb02 % sizeof (src0_t ) == 0 );
656
+ GGML_ASSERT (nb03 % sizeof (src0_t ) == 0 );
657
+
658
+ GGML_ASSERT (nb10 % sizeof (src1_t ) == 0 );
659
+ GGML_ASSERT (nb11 % sizeof (src1_t ) == 0 );
660
+ GGML_ASSERT (nb12 % sizeof (src1_t ) == 0 );
661
+ GGML_ASSERT (nb13 % sizeof (src1_t ) == 0 );
662
+
628
663
GGML_ASSERT (s0 == 1 );
629
664
GGML_ASSERT (s10 == 1 );
630
665
@@ -661,8 +696,8 @@ struct bin_bcast_sycl {
661
696
[=](sycl::nd_item<3 > item_ct1) {
662
697
k_bin_bcast_unravel<bin_op>(
663
698
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
664
- ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12 ,
665
- s13, item_ct1);
699
+ ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02 ,
700
+ s03, s11, s12, s13, item_ct1);
666
701
});
667
702
}
668
703
} else {
@@ -680,7 +715,7 @@ struct bin_bcast_sycl {
680
715
[=](sycl::nd_item<3 > item_ct1) {
681
716
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
682
717
ne2, ne3, ne10, ne11, ne12, ne13,
683
- s1, s2, s3, s11, s12, s13,
718
+ s1, s2, s3, s01, s02, s03, s11, s12, s13,
684
719
item_ct1);
685
720
});
686
721
}
0 commit comments