Skip to content
This repository was archived by the owner on Jan 24, 2025. It is now read-only.

Better string comparison #276

Merged
merged 20 commits into from
Jul 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 63 additions & 2 deletions core/src/main/scala/com/softwaremill/diffx/DiffResult.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,21 @@ trait DiffResult extends Product with Serializable {

object DiffResult {
  private[diffx] final val indentLevel = 5

  /** Collapses runs of adjacent chunk results of the same kind into a single chunk.
    *
    * Two neighbouring [[DiffResultMissingChunk]]s (or two [[DiffResultAdditionalChunk]]s) are replaced by one
    * whose `value` is the concatenation of both; two neighbouring [[DiffResultChunk]]s are replaced by one whose
    * `left` and `right` are concatenated pairwise. Every other element is kept as is, and relative order is
    * preserved.
    *
    * @param diffs results to merge, in rendering order
    * @return the merged results, in the same order
    */
  private[diffx] def mergeChunks(diffs: List[DiffResult]): List[DiffResult] =
    diffs
      // The accumulator is kept in reverse (newest first) so that inspecting and replacing the most recent
      // element is O(1); appending to the end of a List on every step would make the fold quadratic.
      .foldLeft(List.empty[DiffResult]) {
        case ((prev: DiffResultMissingChunk) :: earlier, next: DiffResultMissingChunk) =>
          prev.copy(value = prev.value + next.value) :: earlier
        case ((prev: DiffResultAdditionalChunk) :: earlier, next: DiffResultAdditionalChunk) =>
          prev.copy(value = prev.value + next.value) :: earlier
        case ((prev: DiffResultChunk) :: earlier, next: DiffResultChunk) =>
          prev.copy(left = prev.left + next.left, right = prev.right + next.right) :: earlier
        case (merged, next) =>
          next :: merged
      }
      .reverse

  val Ignored: IdenticalValue[Any] = IdenticalValue("<ignored>")
}

Expand Down Expand Up @@ -102,6 +117,38 @@ case class DiffResultString(diffs: List[DiffResult]) extends DiffResult {
override def isIdentical: Boolean = diffs.forall(_.isIdentical)
}

/** Diff result composed of a list of partial results that are rendered back to back.
  *
  * Rendering first merges adjacent chunks with `mergeChunks` and then concatenates the rendering of every
  * remaining part without any separator.
  */
case class DiffResultStringLine(diffs: List[DiffResult]) extends DiffResult {

  /** True only when every partial result is itself identical. */
  override def isIdentical: Boolean = diffs.forall(part => part.isIdentical)

  override private[diffx] def showIndented(indent: Int, renderIdentical: Boolean)(implicit
      c: ConsoleColorConfig
  ): String = {
    val merged = mergeChunks(diffs)
    val rendered = merged.map(part => part.showIndented(indent, renderIdentical))
    rendered.mkString
  }
}

/** Diff result composed of a list of partial results that are rendered back to back.
  *
  * Rendering first merges adjacent chunks with `mergeChunks` and then concatenates the rendering of every
  * remaining part without any separator.
  */
case class DiffResultStringWord(diffs: List[DiffResult]) extends DiffResult {

  /** True only when every partial result is itself identical. */
  override def isIdentical: Boolean = diffs.forall(part => part.isIdentical)

  override private[diffx] def showIndented(indent: Int, renderIdentical: Boolean)(implicit
      c: ConsoleColorConfig
  ): String = {
    val merged = mergeChunks(diffs)
    val rendered = merged.map(part => part.showIndented(indent, renderIdentical))
    rendered.mkString
  }
}

/** A differing fragment with both a left and a right side; it is never identical.
  *
  * Rendered as the change between `left` and `right` (via `showChange`) wrapped in `[` and `]`, where the
  * brackets are passed through `arrowColor`.
  */
case class DiffResultChunk(left: String, right: String) extends DiffResult {
  override private[diffx] def showIndented(indent: Int, renderIdentical: Boolean)(implicit c: ConsoleColorConfig) = {
    val opening = arrowColor("[")
    val change = showChange(s"$left", s"$right")
    val closing = arrowColor("]")
    opening + change + closing
  }

  override def isIdentical: Boolean = false
}

case class DiffResultValue[T](left: T, right: T) extends DiffResult {
override def showIndented(indent: Int, renderIdentical: Boolean)(implicit c: ConsoleColorConfig): String =
showChange(s"$left", s"$right")
Expand All @@ -118,14 +165,28 @@ case class IdenticalValue[T](value: T) extends DiffResult {

/** Diff result for a value reported as missing; it is never identical.
  *
  * @param value the value to render
  */
case class DiffResultMissing[T](value: T) extends DiffResult {

  /** Renders the string form of `value` through `leftColor`. The `indent` and `renderIdentical`
    * arguments are not used by this result.
    */
  override def showIndented(indent: Int, renderIdentical: Boolean)(implicit c: ConsoleColorConfig): String =
    leftColor(s"$value")

  override def isIdentical: Boolean = false
}

/** A string fragment reported as missing; it is never identical.
  *
  * Rendered as `value` wrapped in square brackets and passed through `leftColor`.
  */
case class DiffResultMissingChunk(value: String) extends DiffResult {
  override def isIdentical: Boolean = false

  override def showIndented(indent: Int, renderIdentical: Boolean)(implicit c: ConsoleColorConfig): String = {
    val bracketed = s"[$value]"
    leftColor(bracketed)
  }
}

/** Diff result for a value reported as additional; it is never identical.
  *
  * @param value the value to render
  */
case class DiffResultAdditional[T](value: T) extends DiffResult {

  /** Renders the string form of `value` through `rightColor`. The `indent` and `renderIdentical`
    * arguments are not used by this result.
    */
  override def showIndented(indent: Int, renderIdentical: Boolean)(implicit c: ConsoleColorConfig): String =
    rightColor(s"$value")

  override def isIdentical: Boolean = false
}

/** A string fragment reported as additional; it is never identical.
  *
  * Rendered as `value` wrapped in square brackets and passed through `rightColor`.
  */
case class DiffResultAdditionalChunk(value: String) extends DiffResult {
  override def isIdentical: Boolean = false

  override def showIndented(indent: Int, renderIdentical: Boolean)(implicit c: ConsoleColorConfig): String = {
    val bracketed = s"[$value]"
    rightColor(bracketed)
  }
}
127 changes: 105 additions & 22 deletions core/src/main/scala/com/softwaremill/diffx/instances/DiffForString.scala
Original file line number Diff line number Diff line change
@@ -1,32 +1,115 @@
package com.softwaremill.diffx.instances

import com.softwaremill.diffx._
import com.softwaremill.diffx.instances.string.DiffRow.Tag
import com.softwaremill.diffx.instances.string.{DiffRow, DiffRowGenerator}

private[diffx] class DiffForString extends Diff[String] {
override def apply(left: String, right: String, context: DiffContext): DiffResult = nullGuard(left, right) {
(left, right) =>
val leftLines = left.split("\n").toList
val rightLines = right.split("\n").toList
val leftAsMap = leftLines.lift
val rightAsMap = rightLines.lift
val maxSize = Math.max(leftLines.length, rightLines.length)
val partialResults = (0 until maxSize).map { i =>
(leftAsMap(i), rightAsMap(i)) match {
case (Some(lv), Some(rv)) =>
if (lv == rv) {
IdenticalValue(lv)
class DiffForString(similarityThreshold: Double = 0.5) extends Diff[String] {
private val generator = DiffRowGenerator.create

/** Compares two strings line by line.
  *
  * Both inputs are split into lines, diff rows are generated for the two line lists and each row is
  * converted to a result. When every row result is identical the whole comparison collapses to a single
  * `IdenticalValue` holding the left string; otherwise the per-line results are wrapped in a
  * `DiffResultString`. The `context` argument is not used here.
  */
override def apply(left: String, right: String, context: DiffContext): DiffResult =
  nullGuard(left, right) { (leftValue, rightValue) =>
    val leftLines = splitIntoLines(leftValue)
    val rightLines = splitIntoLines(rightValue)
    val perLine = processLineDiffs(generator.generateDiffRows(leftLines, rightLines))
    val allSame = perLine.forall(_.isIdentical)
    if (allSame) IdenticalValue(leftValue) else DiffResultString(perLine)
  }

/** Converts line-level diff rows into diff results.
  *
  *  - EQUAL rows become an `IdenticalValue` of the new line.
  *  - INSERT rows, and CHANGE rows whose old line is empty, become `DiffResultMissing` of the new line.
  *  - DELETE rows, and CHANGE rows whose new line is empty, become `DiffResultAdditional` of the old line.
  *  - Any other CHANGE row is tokenized on both sides, diffed again at word level and wrapped in a
  *    `DiffResultStringLine`.
  */
private def processLineDiffs(rows: List[DiffRow]) =
  rows.map {
    case DiffRow(Tag.INSERT, _, added) =>
      DiffResultMissing(added)
    case DiffRow(Tag.DELETE, removed, _) =>
      DiffResultAdditional(removed)
    // The empty-new-line check comes first, so a CHANGE with both sides empty is reported as additional.
    case DiffRow(Tag.CHANGE, removed, added) if added.isEmpty =>
      DiffResultAdditional(removed)
    case DiffRow(Tag.CHANGE, removed, added) if removed.isEmpty =>
      DiffResultMissing(added)
    case DiffRow(Tag.CHANGE, removed, added) =>
      val removedTokens = tokenize(removed)
      val addedTokens = tokenize(added)
      val wordRows = generator.generateDiffRows(removedTokens, addedTokens)
      DiffResultStringLine(processWordDiffs(wordRows))
    case DiffRow(Tag.EQUAL, _, unchanged) =>
      IdenticalValue(unchanged)
  }

private def tokenize(line: String): List[String] = {
line
.foldLeft(List.empty[List[Char]]) { (acc, item) =>
acc.lastOption match {
case Some(word) =>
if (item == ' ') {
acc ++ List(List(item))
} else {
DiffResultValue(lv, rv)
if (word.lastOption.contains(' ')) {
acc :+ List(item)
} else {
acc.dropRight(1) :+ (word ++ List(item))
}
}
case (Some(lv), None) => DiffResultAdditional(lv)
case (None, Some(rv)) => DiffResultMissing(rv)
case (None, None) => throw new IllegalStateException("That should never happen")
case None => acc :+ List(item)
}
}.toList
if (partialResults.forall(_.isIdentical)) {
IdenticalValue(left)
} else {
DiffResultString(partialResults)
}
.map(_.mkString)
}

/** Converts word-level diff rows into diff results.
  *
  *  - EQUAL rows become an `IdenticalValue` of the new word.
  *  - INSERT rows, and CHANGE rows whose old word is empty, become `DiffResultMissingChunk`.
  *  - DELETE rows, and CHANGE rows whose new word is empty, become `DiffResultAdditionalChunk`.
  *  - Any other CHANGE row is diffed character by character. The share of EQUAL character rows is
  *    compared with `similarityThreshold`: below it the two words are reported whole as a
  *    `DiffResultValue`, otherwise the character-level results are wrapped in a `DiffResultStringWord`.
  */
private def processWordDiffs(words: List[DiffRow]): List[DiffResult] =
  words.map {
    case DiffRow(Tag.INSERT, _, added) =>
      DiffResultMissingChunk(added)
    case DiffRow(Tag.DELETE, removed, _) =>
      DiffResultAdditionalChunk(removed)
    case DiffRow(Tag.CHANGE, removed, added) if added.isEmpty =>
      DiffResultAdditionalChunk(removed)
    case DiffRow(Tag.CHANGE, removed, added) if removed.isEmpty =>
      DiffResultMissingChunk(added)
    case DiffRow(Tag.CHANGE, removed, added) =>
      val removedChars = removed.toList.map(_.toString)
      val addedChars = added.toList.map(_.toString)
      val charRows = generator.generateDiffRows(removedChars, addedChars)
      val equalRows = charRows.count(_.tag == Tag.EQUAL)
      val similarity = equalRows.toDouble / charRows.size
      if (similarity < similarityThreshold) DiffResultValue(removed, added)
      else DiffResultStringWord(processCharDiffs(charRows))
    case DiffRow(Tag.EQUAL, _, unchanged) =>
      IdenticalValue(unchanged)
  }

/** Converts character-level diff rows into diff results.
  *
  *  - EQUAL rows become an `IdenticalValue` of the new side.
  *  - INSERT rows, and CHANGE rows whose old side is empty, become `DiffResultMissingChunk`.
  *  - DELETE rows, and CHANGE rows whose new side is empty, become `DiffResultAdditionalChunk`.
  *  - Any other CHANGE row becomes a `DiffResultChunk` holding both sides.
  */
private def processCharDiffs(chars: List[DiffRow]): List[DiffResult] =
  chars.map {
    case DiffRow(Tag.INSERT, _, added) =>
      DiffResultMissingChunk(added)
    case DiffRow(Tag.DELETE, removed, _) =>
      DiffResultAdditionalChunk(removed)
    case DiffRow(Tag.CHANGE, removed, added) if added.isEmpty =>
      DiffResultAdditionalChunk(removed)
    case DiffRow(Tag.CHANGE, removed, added) if removed.isEmpty =>
      DiffResultMissingChunk(added)
    case DiffRow(Tag.CHANGE, removed, added) =>
      DiffResultChunk(removed, added)
    case DiffRow(Tag.EQUAL, _, unchanged) =>
      IdenticalValue(unchanged)
  }

/** Splits a string into its lines, treating both `\r\n` and `\n` as line terminators.
  *
  * Trailing empty lines are kept: `"a\n"` yields `List("a", "")` while `"a"` yields `List("a")`, so two
  * inputs that differ only by trailing newlines produce different line lists.
  *
  * @param string the text to split
  * @return the lines in order; always at least one element (the empty string yields `List("")`)
  */
private def splitIntoLines(string: String): List[String] = {
  // A negative limit stops String.split from silently discarding trailing empty strings.
  string.replace("\r\n", "\n").split("\n", -1).toList
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package com.softwaremill.diffx.instances.string

/** A run of consecutive elements together with the index at which the run starts.
  *
  * @param position index of the first element of the run
  * @param lines the elements of the run
  */
case class Chunk[T](position: Int, lines: List[T]) {

  /** Number of elements in the run. */
  def size: Int = lines.length

  /** Index of the final element of the run, i.e. `position + size - 1` (one less than `position` when
    * the run is empty).
    */
  def last: Int = {
    val offsetOfLast = size - 1
    position + offsetOfLast
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package com.softwaremill.diffx.instances.string

import com.softwaremill.diffx.instances.string.Delta.TYPE

/** A single difference between two sequences, expressed as a pair of chunks.
  *
  * @param original the chunk taken from the original sequence
  * @param revised the chunk taken from the revised sequence
  */
sealed abstract class Delta[T](original: Chunk[T], revised: Chunk[T]) {

  /** The kind of this delta; supplied by each concrete subclass. */
  def getType: TYPE

  /** The original-side chunk. */
  def getOriginal: Chunk[T] = original

  /** Same chunk as [[getOriginal]]. */
  def getSource: Chunk[T] = original

  /** The revised-side chunk. */
  def getRevised: Chunk[T] = revised

  /** Same chunk as [[getRevised]]. */
  def getTarget: Chunk[T] = revised

  override def toString: String = s"Delta(${getType}, ${getOriginal}, ${getRevised})"
}

object Delta {

  /** Closed set of delta kinds, one per concrete delta class. */
  sealed abstract class TYPE

  object TYPE {

    /** Kind reported by `ChangeDelta`. */
    case object CHANGE extends TYPE

    /** Kind reported by `DeleteDelta`. */
    case object DELETE extends TYPE

    /** Kind reported by `InsertDelta`. */
    case object INSERT extends TYPE
  }
}
/** Delta whose type is `TYPE.CHANGE`. */
class ChangeDelta[T](original: Chunk[T], revised: Chunk[T]) extends Delta[T](original, revised) {
  override def getType: TYPE = {
    TYPE.CHANGE
  }
}
/** Delta whose type is `TYPE.INSERT`. */
class InsertDelta[T](original: Chunk[T], revised: Chunk[T]) extends Delta[T](original, revised) {
  override def getType: TYPE = {
    TYPE.INSERT
  }
}
/** Delta whose type is `TYPE.DELETE`. */
class DeleteDelta[T](original: Chunk[T], revised: Chunk[T]) extends Delta[T](original, revised) {
  override def getType: TYPE = {
    TYPE.DELETE
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package com.softwaremill.diffx.instances.string

object DiffRow {

  /** Classification of a single diff row. */
  sealed trait Tag

  object Tag {
    case object EQUAL extends Tag
    case object CHANGE extends Tag
    case object INSERT extends Tag
    case object DELETE extends Tag
  }
}

/** One row of a diff: a classification plus the text on each side.
  *
  * @param tag how the two sides relate
  * @param oldLine text from the old side
  * @param newLine text from the new side
  */
case class DiffRow(tag: DiffRow.Tag, oldLine: String, newLine: String)
Loading