Skip to content

Commit 2459f4e

Browse files
authored
Merge pull request #423 from ldbc/fix-parallelism-dependent-factor-tables
Factorgen: Make people4Hops deterministic regardless of the degree of parallelism
2 parents 4439c68 + 0547dbb commit 2459f4e

File tree

1 file changed

+6
-0
lines changed

1 file changed

+6
-0
lines changed

src/main/scala/ldbc/snb/datagen/factors/FactorGenerationStage.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ object FactorGenerationStage extends DatagenStage with Logging {
173173
date_trunc("day", $"creationDate").as("creationDay"),
174174
date_trunc("day", $"deletionDate").as("deletionDay"),
175175
$"MessageId")
176+
.orderBy($"MessageId")
177+
176178
val sampleSize = 20000.0
177179
val count = messages.count()
178180
val sampleFraction = Math.min(sampleSize / count, 1.0)
@@ -407,6 +409,7 @@ object FactorGenerationStage extends DatagenStage with Logging {
407409
log.info(s"Factor people4Hops: using ${sampleSize} samples (${sampleFraction * 100}%)")
408410

409411
peopleInChina
412+
.orderBy($"Person.id")
410413
.sample(sampleFraction, 42)
411414
.join(relations.alias("knows"), $"Person.id" === $"knows.Person1Id")
412415
.select($"knows.Person1Id".alias("Person1Id"), $"knows.Person2Id".alias("Person2Id"))
@@ -428,6 +431,7 @@ object FactorGenerationStage extends DatagenStage with Logging {
428431
$"Person2.creationDate".as("Person2CreationDate"),
429432
$"Person2.deletionDate".as("Person2DeletionDate")
430433
)
434+
.orderBy($"Person1Id", $"Person2Id")
431435

432436
val sampleFractionPersonPairs = Math.min(10000.0 / personPairs.count(), 1.0)
433437
personPairs.sample(sampleFractionPersonPairs, 42)
@@ -455,6 +459,7 @@ object FactorGenerationStage extends DatagenStage with Logging {
455459
log.info(s"Factor people4Hops: using ${sampleSize} samples (${sampleFraction * 100}%)")
456460

457461
peopleInChina
462+
.orderBy($"Person.id")
458463
.sample(sampleFraction, 42)
459464
.join(relations.alias("knows"), $"Person.id" === $"knows.Person1Id")
460465
.select($"knows.Person1Id".alias("Person1Id"), $"knows.Person2Id".alias("Person2Id"))
@@ -476,6 +481,7 @@ object FactorGenerationStage extends DatagenStage with Logging {
476481
$"Person2.creationDate".as("Person2CreationDate"),
477482
$"Person2.deletionDate".as("Person2DeletionDate")
478483
)
484+
.orderBy($"Person1Id", $"Person2Id")
479485

480486
val sampleFractionPersonPairs = Math.min(10000.0 / personPairs.count(), 1.0)
481487
personPairs.sample(sampleFractionPersonPairs, 42)

0 commit comments

Comments
 (0)