Skip to content

[WIP] fix encoding issues #1494

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Oct 6, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
# These files are text and should be normalized (convert crlf => lf)
*.c text
*.check text
*.css text
*.html text
*.java text
*.js text
*.sbt text
*.scala text
*.sh text
*.txt text
*.xml text

*.c text eol=lf
*.check text eol=lf
*.css text eol=lf
*.html text eol=lf
*.java text eol=lf
*.js text eol=lf
*.sbt text eol=lf
*.scala text eol=lf
*.sh text eol=lf
*.txt text eol=lf
*.xml text eol=lf

# Windows-specific files get windows endings
*.bat eol=crlf
Expand Down
2 changes: 1 addition & 1 deletion dottydoc/test/BaseTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import model.Package
trait DottyTest {
dotty.tools.dotc.parsing.Scanners // initialize keywords

implicit var ctx: FreshContext = {
implicit val ctx: FreshContext = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change seems irrelevant to the PR/commit. It shouldn't appear in the commit.

val base = new ContextBase
import base.settings._
val ctx = base.initialCtx.fresh
Expand Down
2 changes: 1 addition & 1 deletion project/Build.scala
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ object DottyBuild extends Build {
homepage in Global := Some(url("https://github.com/lampepfl/dotty")),

// scalac options
scalacOptions in Global ++= Seq("-feature", "-deprecation", "-language:existentials,higherKinds,implicitConversions"),
scalacOptions in Global ++= Seq("-feature", "-deprecation", "-encoding", "UTF8", "-language:existentials,higherKinds,implicitConversions"),

javacOptions in Global ++= Seq("-Xlint:unchecked", "-Xlint:deprecation")
)
Expand Down
6 changes: 4 additions & 2 deletions src/dotty/tools/dotc/Run.scala
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import Phases._
import Decorators._
import dotty.tools.dotc.transform.TreeTransforms.TreeTransformer
import io.PlainFile
import scala.io.Codec
import util._
import reporting.Reporter
import transform.TreeChecker
Expand All @@ -28,8 +29,9 @@ class Run(comp: Compiler)(implicit ctx: Context) {
var units: List[CompilationUnit] = _

def getSource(fileName: String): SourceFile = {
val encoding = ctx.settings.encoding.value
val f = new PlainFile(fileName)
if (f.exists) new SourceFile(f)
if (f.exists) new SourceFile(f, Codec(encoding))
else {
ctx.error(s"not found: $fileName")
NoSource
Expand Down Expand Up @@ -113,7 +115,7 @@ class Run(comp: Compiler)(implicit ctx: Context) {
val writer = new BufferedWriter(new OutputStreamWriter(virtualFile.output, "UTF-8")) // buffering is still advised by javadoc
writer.write(sourceCode)
writer.close()
compileSources(List(new SourceFile(virtualFile)))
compileSources(List(new SourceFile(virtualFile, Codec.UTF8)))
}

/** The context created for this run */
Expand Down
2 changes: 2 additions & 0 deletions src/dotty/tools/dotc/config/Settings.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ object Settings {
private var values = ArrayBuffer(initialValues: _*)
private var _wasRead: Boolean = false

override def toString = s"SettingsState(values: ${values.toList})"

def value(idx: Int): Any = {
_wasRead = true
values(idx)
Expand Down
7 changes: 4 additions & 3 deletions src/dotty/tools/dotc/util/SourceFile.scala
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import java.io.IOException
import Chars._
import ScriptSourceFile._
import Positions._
import scala.io.Codec

import java.util.Optional

Expand All @@ -36,9 +37,9 @@ object ScriptSourceFile {

case class SourceFile(file: AbstractFile, content: Array[Char]) extends interfaces.SourceFile {

def this(_file: AbstractFile) = this(_file, _file.toCharArray)
def this(sourceName: String, cs: Seq[Char]) = this(new VirtualFile(sourceName), cs.toArray)
def this(file: AbstractFile, cs: Seq[Char]) = this(file, cs.toArray)
def this(_file: AbstractFile, codec: Codec) = this(_file, new String(_file.toByteArray, codec.charSet).toCharArray)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure using Codec is the best choice. scala.io.* is not really a nice API, and using it when we can do otherwise doesn't seem wise to me.

I suggest directly using java.nio.Charset.

def this(sourceName: String, cs: Seq[Char]) = this(new VirtualFile(sourceName), cs.toArray)
def this(file: AbstractFile, cs: Seq[Char]) = this(file, cs.toArray)

/** Tab increment; can be overridden */
def tabInc = 8
Expand Down
10 changes: 7 additions & 3 deletions test/dotc/tests.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ class tests extends CompilerTest {
val defaultOutputDir = "./out/"

implicit val defaultOptions = noCheckOptions ++ List(
"-Yno-deep-subtypes", "-Yno-double-bindings", "-Yforce-sbt-phases",
"-d", defaultOutputDir) ++ {
"-Yno-deep-subtypes", "-Yno-double-bindings", "-Yforce-sbt-phases", "-d", defaultOutputDir) ++ {
if (isRunByJenkins) List("-Ycheck:tailrec,resolveSuper,mixin,restoreScopes,labelDef") // should be Ycheck:all, but #725
else List("-Ycheck:tailrec,resolveSuper,mixin,restoreScopes,labelDef")
}
Expand All @@ -38,6 +37,9 @@ class tests extends CompilerTest {
val allowDoubleBindings = defaultOptions diff List("-Yno-double-bindings")
val scala2mode = List("-language:Scala2")

val explicitUTF8 = List("-encoding", "UTF8")
val explicitUTF16 = List("-encoding", "UTF16")

val testsDir = "./tests/"
val posDir = testsDir + "pos/"
val posSpecialDir = testsDir + "pos-special/"
Expand Down Expand Up @@ -95,7 +97,7 @@ class tests extends CompilerTest {
@Test def pos_overloadedAccess = compileFile(posDir, "overloadedAccess", twice)
@Test def pos_approximateUnion = compileFile(posDir, "approximateUnion", twice)
@Test def pos_tailcall = compileDir(posDir, "tailcall", twice)
@Test def pos_valueclasses = compileFiles(posDir + "valueclasses/", twice)
@Test def pos_valueclasses = compileFiles(posDir + "pos_valueclasses/", twice)
@Test def pos_nullarify = compileFile(posDir, "nullarify", args = "-Ycheck:nullarify" :: Nil)
@Test def pos_subtyping = compileFile(posDir, "subtyping", twice)
@Test def pos_packageObj = compileFile(posDir, "i0239", twice)
Expand All @@ -118,6 +120,8 @@ class tests extends CompilerTest {
compileFile(posSpecialDir, "spec-t5545/S_1")
compileFile(posSpecialDir, "spec-t5545/S_2")
}
@Test def pos_utf8 = compileFile(posSpecialDir, "utf8encoded", explicitUTF8)
@Test def pos_utf16 = compileFile(posSpecialDir, "utf16encoded", explicitUTF16)

@Test def new_all = compileFiles(newDir, twice)
@Test def repl_all = replFiles(replDir)
Expand Down
41 changes: 34 additions & 7 deletions test/test/CompilerTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ abstract class CompilerTest {

private def expectedErrors(filePath: String): List[ErrorsInFile] = expectedErrors(List(filePath))

private def isNegTest(testPath: String) = testPath.contains(JFile.separator + "neg" + JFile.separator)
private def isNegTest(testPath: String) = testPath.contains("/neg/")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this change have been part of another commit?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

32819e2 should probably be broken up in just the part that fixes dotty not honoring default and passed in -encoding arguments (which is an actual bug in dotty) and the parts that get the tests running on Windows (which is "just" a set of test-suite bugs). WDYT?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds good :)


private def compileArgs(args: Array[String], expectedErrorsPerFile: List[ErrorsInFile])
(implicit defaultOptions: List[String]): Unit = {
Expand Down Expand Up @@ -413,7 +413,8 @@ abstract class CompilerTest {
val flags = oldFlags.map(f => if (f == oldOutput) partestOutput else f) ++
List(s"-classpath $partestOutput") // Required for separate compilation tests

getExisting(dest).isDifferent(source, flags, nerr) match {
val difference = getExisting(dest).isDifferent(source, flags, nerr)
difference match {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Useless diff?

case NotExists => copyFiles(source, dest, partestOutput, flags, nerr, kind)
case ExistsSame => // nothing else to do
case ExistsDifferent =>
Expand Down Expand Up @@ -449,11 +450,37 @@ abstract class CompilerTest {
/** Recursively copy over source files and directories, excluding extensions
* that aren't in extensionsToCopy. */
private def recCopyFiles(sourceFile: Path, dest: Path): Unit = {
processFileDir(sourceFile, { sf =>

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Useless diff?

def copyfile(file: SFile, bytewise: Boolean): Unit = {
if (bytewise) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of having a boolean param, you should have two separate defs: copyfile and copyfileBytewise.

val in = file.inputStream()
val out = SFile(dest).outputStream()
val buffer = new Array[Byte](1024)
def loop(available: Int):Unit = {
if (available < 0) {()}
else {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider just if (available >= 0) {

out.write(buffer, 0, available)
val read = in.read(buffer)
loop(read)
}
}
loop(0)
in.close()
out.close()
} else {
try {
SFile(dest)(scala.io.Codec.UTF8).writeAll((s"/* !!!!! WARNING: DO NOT MODIFY. Original is at: $file !!!!! */").replace("\\", "/"), file.slurp("UTF-8"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the .replace("\\", "/")?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

some path parts start with u. Then comment then contains \u(something), which gets interpreted as an invalid unicode escape, failing compilation.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah ah!
Well, that deserves to be written as a comment above this line, then, IMO.

} catch {
case unmappable: java.nio.charset.MalformedInputException =>
copyfile(file, true) //there are bytes that can't be mapped with UTF-8. Bail and just do a straight byte-wise copy without the warning header.
}
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why this whole thing anyway? Files should always be copied bytewise, shouldn't they? They have an encoding before, and they should have the exact same encoding after. There is no need to decode and re-encode them.

The comment should be pure-ASCII anyway, so it doesn't really matter what encoding is used. It could actually be encoded as such (with US_ASCII) using a replacement char ? for characters of $file that are unmappable in US-ASCII.

That would be much simpler, not to mention faster.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the comment is pure-ASCII, but you can't write it to file if you don't know the encoding of the file; the encoding of the comment in UTF-16 or UTF-32 is unequal to the encoding in UTF-8.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True. I stupidly did not think about UTF-16.

That said, this code only applies to files in this repo, doesn't it? We shouldn't have UTF-16-encoded files in this repo, so it should be fine. It would be a different matter if this code were applied to arbitrary files in users' repositories, of course.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is one in there now, namely the UTF-16 test - which tests whether the passed in -encoding compiler flag is honored. I didn't see any way to test that other than this. I could see if I can resort to tricks, finding an encoding/file that is both valid UTF-8, but doesn't compile as UTF-8, but is also a valid different encoding and does compile as that encoding. A third alternative is not having a test for it. None of the alternatives (this ugly thing which is only needed for one test, unclear tricks with a polyglot file encoding, or not having a test) are really appealing IMO.

Copy link
Member

@sjrd sjrd Sep 11, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Easier: use a latin1 test instead of a UTF-16 test. Use a French é in some identifier, that's all it takes. It won't be valid UTF-8, but it will parse and compile as latin1. Besides, latin1 is a superset of US-ASCII, so the comment can stay US-ASCII.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it won't be valid UTF-8, then the file can't be slurped with UTF-8, and the file copy fails

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My comment was to use that in the potential variant where files are never slurped to being with. They are always copied byte-wise.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I think I get what you mean: like this: martijnhoekstra@42fc87a ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, like that.

}

processFileDir(sourceFile, { sf =>
if (extensionsToCopy.contains(sf.extension)) {
dest.parent.jfile.mkdirs
dest.toFile.writeAll("/* !!!!! WARNING: DO NOT MODIFY. Original is at: $sf !!!!! */",
sf.slurp())
copyfile(sf, false)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indentation issue (tabs)

} else {
log(s"WARNING: ignoring $sf")
}
Expand All @@ -465,7 +492,7 @@ abstract class CompilerTest {

/** Reads the existing files for the given test source if any. */
private def getExisting(dest: Path): ExistingFiles = {
val content: Option[Option[String]] = processFileDir(dest, f => f.safeSlurp, d => Some(""))
val content: Option[Option[String]] = processFileDir(dest, f => try Some(f.slurp("UTF8")) catch {case io: java.io.IOException => Some(io.toString())}, d => Some(""))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This ugly thing definitely need be factored out (also used a couple lines below).

What does this do, anyway? Shouldn't safeSlurp be responsible of this safety net?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

safeSlurp is problematic in that it can't take an encoding as an argument, and uses the encoding when the File is created, and many ways to create a File don't have the opportunity to pass an encoding either (in this case ifFile).

That said, it can still go from this line.

if (content.isDefined && content.get.isDefined) {
val flags = (dest changeExtension "flags").toFile.safeSlurp
val nerr = (dest changeExtension "nerr").toFile.safeSlurp
Expand All @@ -479,7 +506,7 @@ abstract class CompilerTest {
if (!genSrc.isDefined) {
NotExists
} else {
val source = processFileDir(sourceFile, { f => f.safeSlurp }, { d => Some("") },
val source = processFileDir(sourceFile, { f => try Some(f.slurp("UTF8")) catch {case _: java.io.IOException => None} }, { d => Some("") },
Some("DPCompilerTest sourceFile doesn't exist: " + sourceFile)).get
if (source == genSrc) {
nerr match {
Expand Down
1 change: 1 addition & 0 deletions test/test/DottyTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class DottyTest /*extends ContextEscapeDetection*/ {
import base.settings._
val ctx = base.initialCtx.fresh
base.initialize()(ctx)
ctx.setSetting(ctx.settings.encoding, "UTF8")
ctx
}
/*
Expand Down
2 changes: 1 addition & 1 deletion test/test/InterfaceEntryPointTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import scala.collection.mutable.ListBuffer
*/
class InterfaceEntryPointTest {
@Test def runCompilerFromInterface = {
val sources = List("./tests/pos/HelloWorld.scala")
val sources = List("./tests/pos/HelloWorld.scala").map(p => new java.io.File(p).getPath())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be good to factor out this incantation as normalizePath(path: String), since it is used 3 times (currently).

val args = sources ++ List("-d", "./out/")

val mainClass = Class.forName("dotty.tools.dotc.Main")
Expand Down
4 changes: 2 additions & 2 deletions test/test/OtherEntryPointsTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import scala.collection.mutable.ListBuffer
*/
class OtherEntryPointsTest {
@Test def runCompiler = {
val sources = List("./tests/pos/HelloWorld.scala")
val sources = List("./tests/pos/HelloWorld.scala").map(p => new java.io.File(p).getPath())
val args = sources ++ List("-d", "./out/")

val reporter = new CustomReporter
Expand All @@ -31,7 +31,7 @@ class OtherEntryPointsTest {
}

@Test def runCompilerWithContext = {
val sources = List("./tests/pos/HelloWorld.scala")
val sources = List("./tests/pos/HelloWorld.scala").map(p => new java.io.File(p).getPath())
val args = sources ++ List("-d", "./out/")

val reporter = new CustomReporter
Expand Down
3 changes: 2 additions & 1 deletion test/test/ParserTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import scala.reflect.io._
import dotty.tools.dotc.util._
import dotty.tools.dotc.core._
import dotty.tools.dotc.parsing._
import scala.io.Codec
import Tokens._, Parsers._
import dotty.tools.dotc.ast.untpd._
import org.junit.Test
Expand All @@ -23,7 +24,7 @@ class ParserTest extends DottyTest {

def parse(file: PlainFile): Tree = {
//println("***** parsing " + file)
val source = new SourceFile(file)
val source = new SourceFile(file, Codec.UTF8)
val parser = new Parser(source)
val tree = parser.parse()
parsed += 1
Expand Down
5 changes: 3 additions & 2 deletions test/test/ScannerTest.scala
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package test

import scala.reflect.io._
import scala.io.Codec
import dotty.tools.dotc.util._
import dotty.tools.dotc.parsing._
import Tokens._, Scanners._
Expand All @@ -16,8 +17,8 @@ class ScannerTest extends DottyTest {
def scan(name: String): Unit = scan(new PlainFile(name))

def scan(file: PlainFile): Unit = {
println("***** scanning " + file)
val source = new SourceFile(file)
//println("***** scanning " + file)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment seems irrelevant to the PR.

val source = new SourceFile(file, Codec.UTF8)
val scanner = new Scanner(source)
var i = 0
while (scanner.token != EOF) {
Expand Down
4 changes: 2 additions & 2 deletions test/test/TestREPL.scala
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class TestREPL(script: String) extends REPL {
while (lines.hasNext && lines.head.startsWith(continuationPrompt)) {
val continued = lines.next
output.println(continued)
buf append "\n"
buf append System.lineSeparator()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this still needed, despite the comparison through .lines down below?

buf append continued.drop(continuationPrompt.length)
}
buf.toString
Expand All @@ -49,7 +49,7 @@ class TestREPL(script: String) extends REPL {
out.close()
val printed = out.toString
val transcript = printed.drop(printed.indexOf(config.prompt))
if (transcript.toString != script) {
if (transcript.toString.lines.toList != script.lines.toList) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can avoid the reification as Lists with sameElements:

if (transcript.toString.lines.sameElements(script.lines))

println("input differs from transcript:")
println(transcript)
assert(false)
Expand Down
Binary file added tests/pos-special/utf16encoded.scala
Binary file not shown.
8 changes: 8 additions & 0 deletions tests/pos-special/utf8encoded.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
//this file is saved as UTF-8
object Test {
def main(args: Array[String]): Unit = {
val testchar = '⇒'
println(testchar == '\u21D2')
}

}