jf-tech · jf-tech · Sep 19, 2022 · Sep 19, 2022
diff --git a/extensions/omniv21/fileformat/flatfile/csv/.snapshots/TestRead-multiple_records b/extensions/omniv21/fileformat/flatfile/csv/.snapshots/TestRead-multiple_records
@@ -4,7 +4,7 @@
 			"Children": [
 				{
 					"Children": null,
-					"Data": "v3",
+					"Data": "v1",
 					"FirstChild": null,
 					"FormatSpecific": null,
 					"LastChild": null,
@@ -15,9 +15,9 @@
 				}
 			],
 			"Data": "r1c1",
-			"FirstChild": "(TextNode 'v3')",
+			"FirstChild": "(TextNode 'v1')",
 			"FormatSpecific": null,
-			"LastChild": "(TextNode 'v3')",
+			"LastChild": "(TextNode 'v1')",
 			"NextSibling": "(ElementNode r1c2)",
 			"Parent": "(ElementNode r1)",
 			"PrevSibling": null,

diff --git a/extensions/omniv21/fileformat/flatfile/csv/.snapshots/TestValidateSchema-success b/extensions/omniv21/fileformat/flatfile/csv/.snapshots/TestValidateSchema-success
@@ -0,0 +1,38 @@
+{
+	"file_declaration": {
+		"delimiter": ",",
+		"records": [
+			{
+				"name": "e1",
+				"type": "record_group",
+				"is_target": true,
+				"min": 1,
+				"max": 1,
+				"child_records": [
+					{
+						"name": "e2",
+						"max": 5,
+						"columns": [
+							{
+								"name": "c1",
+								"index": 2
+							}
+						]
+					},
+					{
+						"name": "e2",
+						"header": "^ABC$",
+						"columns": [
+							{
+								"name": "c2",
+								"index": 1,
+								"line_pattern": "^H00"
+							}
+						]
+					}
+				]
+			}
+		]
+	},
+	"XPath": ".[c1 != 'skip']"
+}
diff --git a/extensions/omniv21/fileformat/flatfile/csv/decl.go b/extensions/omniv21/fileformat/flatfile/csv/decl.go
@@ -20,21 +20,21 @@ type ColumnDecl struct {
 	linePatternRegexp *regexp.Regexp
 }
 
+// lineMatch reports whether this column applies to the given line. If LineIndex is set it
+// alone decides the match; otherwise the line pattern, if any, is matched against the line.
+// With neither set, every line matches.
-func (c *ColumnDecl) lineMatch(lineIndex int, line *line, delim string) bool {
+func (c *ColumnDecl) lineMatch(lineIndex int, line *line, records []string, delim string) bool {
 	if c.LineIndex != nil {
 		return *c.LineIndex == lineIndex+1 // c.LineIndex is 1 based.
 	}
 	if c.linePatternRegexp != nil {
-		return matchLine(c.linePatternRegexp, line, delim)
+		return matchLine(c.linePatternRegexp, line, records, delim)
 	}
 	return true
 }
 
+// lineToColumnValue returns the value at the 1-based c.Index within the line's window of
+// records, i.e. records[line.recordStart+*c.Index-1]. It returns "" if c.Index falls outside
+// [1, line.recordNum]. c.Index is dereferenced unconditionally.
-func (c *ColumnDecl) lineToColumnValue(line *line) string {
-	if *c.Index < 1 || *c.Index > len(line.record) {
+func (c *ColumnDecl) lineToColumnValue(line *line, records []string) string {
+	if *c.Index < 1 || *c.Index > line.recordNum {
 		return ""
 	}
-	return line.record[*c.Index-1]
+	return records[line.recordStart+*c.Index-1]
 }
 
 const (
@@ -122,26 +122,26 @@ func (r *RecordDecl) rows() int {
 	return *r.Rows
 }
 
+// matchHeader reports whether the line matches the record's header regexp. It panics if the
+// record has no header regexp.
-func (r *RecordDecl) matchHeader(line *line, delim string) bool {
+func (r *RecordDecl) matchHeader(line *line, records []string, delim string) bool {
 	if r.headerRegexp == nil {
 		panic(fmt.Sprintf("record '%s' is not header/footer based", r.fqdn))
 	}
-	return matchLine(r.headerRegexp, line, delim)
+	return matchLine(r.headerRegexp, line, records, delim)
 }
 
+// matchFooter reports whether the line matches the record's footer regexp.
+//
 // Footer is optional. If not specified, it always matches. Thus for a header/footer record,
 // if the footer isn't specified, it effectively becomes a single-row record matched by header,
 // given that after the header matches a line, matchFooter is called on the same line.
-func (r *RecordDecl) matchFooter(line *line, delim string) bool {
+func (r *RecordDecl) matchFooter(line *line, records []string, delim string) bool {
 	if r.footerRegexp == nil {
 		return true
 	}
-	return matchLine(r.footerRegexp, line, delim)
+	return matchLine(r.footerRegexp, line, records, delim)
 }
 
+// matchLine reports whether re matches the line's raw text. When line.raw is empty, it is
+// first built by joining records[line.recordStart:line.recordStart+line.recordNum] with delim
+// and stored on the line; once line.raw is non-empty, later calls reuse it and ignore records
+// and delim.
-func matchLine(re *regexp.Regexp, line *line, delim string) bool {
+func matchLine(re *regexp.Regexp, line *line, records []string, delim string) bool {
 	if line.raw == "" {
-		line.raw = strings.Join(line.record, delim)
+		line.raw = strings.Join(records[line.recordStart:line.recordStart+line.recordNum], delim)
 	}
 	return re.MatchString(line.raw)
 }

diff --git a/extensions/omniv21/fileformat/flatfile/csv/decl_test.go b/extensions/omniv21/fileformat/flatfile/csv/decl_test.go
@@ -13,24 +13,26 @@ import (
 
 func TestColumnDecl_LineMatch(t *testing.T) {
 	// no line_index/line_pattern, always match
-	assert.True(t, (&ColumnDecl{}).lineMatch(0, &line{}, ""))
-	assert.False(t, (&ColumnDecl{LineIndex: testlib.IntPtr(2)}).lineMatch(0, &line{}, ""))
-	assert.True(t, (&ColumnDecl{LineIndex: testlib.IntPtr(2)}).lineMatch(1, &line{}, ""))
+	assert.True(t, (&ColumnDecl{}).lineMatch(0, &line{}, nil, ""))
+	// line_index is 1-based while the lineIndex argument is 0-based.
+	assert.False(t, (&ColumnDecl{LineIndex: testlib.IntPtr(2)}).lineMatch(0, &line{}, nil, ""))
+	assert.True(t, (&ColumnDecl{LineIndex: testlib.IntPtr(2)}).lineMatch(1, &line{}, nil, ""))
+	// line_pattern is matched against the line's records joined with the delimiter.
 	assert.False(t, (&ColumnDecl{linePatternRegexp: regexp.MustCompile("^ABC.*$")}).
-		lineMatch(0, &line{record: []string{"123", "456"}}, ","))
-	l := &line{record: []string{"ABC", "DEF"}}
+		lineMatch(0, &line{recordStart: 0, recordNum: 2}, []string{"123", "456"}, ","))
+	l := &line{recordStart: 0, recordNum: 2}
 	assert.True(t,
-		(&ColumnDecl{linePatternRegexp: regexp.MustCompile("^ABC\\|D.*$")}).lineMatch(0, l, "|"))
+		(&ColumnDecl{linePatternRegexp: regexp.MustCompile("^ABC\\|D.*$")}).lineMatch(
+			0, l, []string{"ABC", "DEF"}, "|"))
+	// the joined text is stored on the line as a side effect of pattern matching.
 	assert.Equal(t, "ABC|DEF", l.raw)
 }
 
 func TestColumnDecl_LineToColumnValue(t *testing.T) {
 	assert.Equal(t, "", (&ColumnDecl{Index: testlib.IntPtr(2)}).lineToColumnValue(
-		&line{record: []string{"1"}})) // index out of range
+		&line{recordNum: 1}, nil)) // index out of range
 	assert.Equal(t, "", (&ColumnDecl{Index: testlib.IntPtr(0)}).lineToColumnValue(
-		&line{record: []string{"1"}})) // index out of range
-	assert.Equal(t, "9", (&ColumnDecl{Index: testlib.IntPtr(5)}).lineToColumnValue(
-		&line{record: []string{"1", "3", "5", "7", "9", "11"}})) // in range
+		&line{recordNum: 1}, nil)) // index out of range
+	// Index is 1-based and relative to recordStart: records[1+5-1] is "6".
+	assert.Equal(t, "6", (&ColumnDecl{Index: testlib.IntPtr(5)}).lineToColumnValue(
+		&line{recordStart: 1, recordNum: 7},                    // "2" .. "8"
+		[]string{"1", "2", "3", "4", "5", "6", "7", "8", "9"})) // in range
 }
 
 func TestRecordDecl(t *testing.T) {
@@ -85,24 +87,25 @@ func TestRecordDecl(t *testing.T) {
 
 	// matchHeader()
 	assert.PanicsWithValue(
-		t, "record 'r1' is not header/footer based", func() { r.matchHeader(&line{}, "") })
+		t, "record 'r1' is not header/footer based", func() { r.matchHeader(&line{}, nil, "") })
 	r.headerRegexp = regexp.MustCompile("^ABC,")
-	assert.False(t, r.matchHeader(&line{}, ","))
-	line := &line{record: []string{"ABC", "EFG"}}
-	assert.True(t, r.matchHeader(line, ","))
+	assert.False(t, r.matchHeader(&line{}, nil, ","))
+	line := &line{recordStart: 1, recordNum: 2}
+	assert.True(t, r.matchHeader(line, []string{"123", "ABC", "EFG"}, ","))
 
 	// matchFooter()
-	assert.True(t, r.matchFooter(line, ","))
+	assert.True(t, r.matchFooter(line, nil, ",")) // if `footer` not specified, always match.
 	r.footerRegexp = regexp.MustCompile("^ABC,EF.*$")
-	assert.True(t, r.matchFooter(line, ","))
+	assert.True(t, r.matchFooter(line, []string{"123", "ABC", "EFG"}, ","))
 }
 
 func TestMatchLine(t *testing.T) {
-	line := &line{record: []string{"1", "2"}}
+	records := []string{"0", "1", "2", "3"}
+	line := &line{recordStart: 1, recordNum: 2} // "1", "2"
 	assert.Equal(t, "", line.raw)
+	// the first call builds line.raw from the line's record window, even when the match fails.
-	assert.False(t, matchLine(regexp.MustCompile("^1\\|2$"), line, ","))
+	assert.False(t, matchLine(regexp.MustCompile("^1\\|2$"), line, records, ","))
 	assert.Equal(t, "1,2", line.raw)
+	// the second call finds line.raw already populated and matches against it.
-	assert.True(t, matchLine(regexp.MustCompile("^1,2$"), line, ","))
+	assert.True(t, matchLine(regexp.MustCompile("^1,2$"), line, records, ","))
 }
 
 func TestToFlatFileRecDecls(t *testing.T) {

diff --git a/extensions/omniv21/fileformat/flatfile/csv/format.go b/extensions/omniv21/fileformat/flatfile/csv/format.go
@@ -0,0 +1,95 @@
+package csv
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"strings"
+
+	"github.com/antchfx/xpath"
+	"github.com/jf-tech/go-corelib/caches"
+	"github.com/jf-tech/go-corelib/strs"
+
+	"github.com/jf-tech/omniparser/errs"
+	"github.com/jf-tech/omniparser/extensions/omniv21/fileformat"
+	"github.com/jf-tech/omniparser/extensions/omniv21/transform"
+	v21validation "github.com/jf-tech/omniparser/extensions/omniv21/validation"
+	"github.com/jf-tech/omniparser/validation"
+)
+
+const (
+	// fileFormatCSV is the only format name ValidateSchema accepts.
+	fileFormatCSV = "csv2"
+)
+
+// csvFormat is the fileformat.FileFormat implementation returned by NewCSVFileFormat.
+type csvFormat struct {
+	// schemaName is included in the errors produced by FmtErr.
+	schemaName string
+}
+
+// NewCSVFileFormat returns the FileFormat implementation for 'csv2'. schemaName is used to
+// prefix the errors the format reports.
+func NewCSVFileFormat(schemaName string) fileformat.FileFormat {
+	f := new(csvFormat)
+	f.schemaName = schemaName
+	return f
+}
+
+// csvFormatRuntime is the value ValidateSchema returns and CreateFormatReader consumes.
+type csvFormatRuntime struct {
+	// Decl is unmarshaled from the schema's 'file_declaration' section.
+	Decl  *FileDecl `json:"file_declaration"`
+	// XPath is the whitespace-trimmed 'FINAL_OUTPUT' xpath set by ValidateSchema; "" if none.
+	XPath string
+}
+
+// ValidateSchema validates a 'csv2' schema and returns the runtime data later passed to
+// CreateFormatReader.
+//
+// It returns errs.ErrSchemaNotSupported if format isn't 'csv2'. Otherwise schemaContent is
+// checked against the csv2 JSON schema, unmarshaled, and its file declaration validated.
+// finalOutputDecl must be non-nil, and its xpath, if non-empty after trimming, must compile.
+func (f *csvFormat) ValidateSchema(
+	format string, schemaContent []byte, finalOutputDecl *transform.Decl) (interface{}, error) {
+	if format != fileFormatCSV {
+		return nil, errs.ErrSchemaNotSupported
+	}
+	err := validation.SchemaValidate(
+		f.schemaName, schemaContent, v21validation.JSONSchemaCSV2FileDeclaration)
+	if err != nil {
+		// err is already context formatted.
+		return nil, err
+	}
+	var runtime csvFormatRuntime
+	// JSON schema validation above is expected to make Unmarshal succeed; if it doesn't,
+	// report it instead of continuing with a partially populated runtime.
+	if err := json.Unmarshal(schemaContent, &runtime); err != nil {
+		return nil, f.FmtErr("unable to unmarshal schema: %s", err.Error())
+	}
+	err = f.validateFileDecl(runtime.Decl)
+	if err != nil {
+		// err is already context formatted.
+		return nil, err
+	}
+	if finalOutputDecl == nil {
+		return nil, f.FmtErr("'FINAL_OUTPUT' is missing")
+	}
+	runtime.XPath = strings.TrimSpace(strs.StrPtrOrElse(finalOutputDecl.XPath, ""))
+	if runtime.XPath != "" {
+		// Compile now so an invalid xpath is rejected at schema validation time.
+		if _, err := caches.GetXPathExpr(runtime.XPath); err != nil {
+			return nil, f.FmtErr("'FINAL_OUTPUT.xpath' (value: '%s') is invalid, err: %s",
+				runtime.XPath, err.Error())
+		}
+	}
+	return &runtime, nil
+}
+
+// validateFileDecl validates decl and prefixes any failure with the schema name. The error
+// text is passed as a "%s" argument, never as the format, so a '%' in it stays intact.
+func (f *csvFormat) validateFileDecl(decl *FileDecl) error {
+	if err := (&validateCtx{}).validateFileDecl(decl); err != nil {
+		return f.FmtErr("%s", err.Error())
+	}
+	return nil
+}
+
+// CreateFormatReader builds the FormatReader over r from the runtime data produced by
+// ValidateSchema; runtime must be a *csvFormatRuntime. When the runtime xpath is "" or ".",
+// a nil xpath expression is passed to the reader; otherwise the compiled expression is.
+func (f *csvFormat) CreateFormatReader(
+	name string, r io.Reader, runtime interface{}) (fileformat.FormatReader, error) {
+	rt := runtime.(*csvFormatRuntime)
+	var targetXPathExpr *xpath.Expr
+	if rt.XPath != "" && rt.XPath != "." {
+		expr, err := caches.GetXPathExpr(rt.XPath)
+		if err != nil {
+			return nil, f.FmtErr("xpath '%s' on 'FINAL_OUTPUT' is invalid: %s", rt.XPath, err.Error())
+		}
+		targetXPathExpr = expr
+	}
+	return NewReader(name, r, rt.Decl, targetXPathExpr), nil
+}
+
+// FmtErr formats a message from format and args and returns it as an error prefixed with
+// the schema name.
+func (f *csvFormat) FmtErr(format string, args ...interface{}) error {
+	msg := fmt.Sprintf(format, args...)
+	return fmt.Errorf("schema '%s': %s", f.schemaName, msg)
+}