Skip to content

Adding csv2 file format. Fixing a major bug in the csv2.reader implementation #179

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"Children": [
{
"Children": null,
"Data": "v3",
"Data": "v1",
"FirstChild": null,
"FormatSpecific": null,
"LastChild": null,
Expand All @@ -15,9 +15,9 @@
}
],
"Data": "r1c1",
"FirstChild": "(TextNode 'v3')",
"FirstChild": "(TextNode 'v1')",
"FormatSpecific": null,
"LastChild": "(TextNode 'v3')",
"LastChild": "(TextNode 'v1')",
"NextSibling": "(ElementNode r1c2)",
"Parent": "(ElementNode r1)",
"PrevSibling": null,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"file_declaration": {
"delimiter": ",",
"records": [
{
"name": "e1",
"type": "record_group",
"is_target": true,
"min": 1,
"max": 1,
"child_records": [
{
"name": "e2",
"max": 5,
"columns": [
{
"name": "c1",
"index": 2
}
]
},
{
"name": "e2",
"header": "^ABC$",
"columns": [
{
"name": "c2",
"index": 1,
"line_pattern": "^H00"
}
]
}
]
}
]
},
"XPath": ".[c1 != 'skip']"
}
22 changes: 11 additions & 11 deletions extensions/omniv21/fileformat/flatfile/csv/decl.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,21 @@ type ColumnDecl struct {
linePatternRegexp *regexp.Regexp
}

func (c *ColumnDecl) lineMatch(lineIndex int, line *line, delim string) bool {
func (c *ColumnDecl) lineMatch(lineIndex int, line *line, records []string, delim string) bool {
if c.LineIndex != nil {
return *c.LineIndex == lineIndex+1 // c.LineIndex is 1 based.
}
if c.linePatternRegexp != nil {
return matchLine(c.linePatternRegexp, line, delim)
return matchLine(c.linePatternRegexp, line, records, delim)
}
return true
}

func (c *ColumnDecl) lineToColumnValue(line *line) string {
if *c.Index < 1 || *c.Index > len(line.record) {
func (c *ColumnDecl) lineToColumnValue(line *line, records []string) string {
if *c.Index < 1 || *c.Index > line.recordNum {
return ""
}
return line.record[*c.Index-1]
return records[line.recordStart+*c.Index-1]
}

const (
Expand Down Expand Up @@ -122,26 +122,26 @@ func (r *RecordDecl) rows() int {
return *r.Rows
}

func (r *RecordDecl) matchHeader(line *line, delim string) bool {
func (r *RecordDecl) matchHeader(line *line, records []string, delim string) bool {
if r.headerRegexp == nil {
panic(fmt.Sprintf("record '%s' is not header/footer based", r.fqdn))
}
return matchLine(r.headerRegexp, line, delim)
return matchLine(r.headerRegexp, line, records, delim)
}

// Footer is optional. If not specified, it always matches. Thus for a header/footer record,
// if the footer isn't specified, it effectively becomes a single-row record matched by header,
// given that after the header matches a line, matchFooter is called on the same line.
func (r *RecordDecl) matchFooter(line *line, delim string) bool {
func (r *RecordDecl) matchFooter(line *line, records []string, delim string) bool {
if r.footerRegexp == nil {
return true
}
return matchLine(r.footerRegexp, line, delim)
return matchLine(r.footerRegexp, line, records, delim)
}

func matchLine(re *regexp.Regexp, line *line, delim string) bool {
func matchLine(re *regexp.Regexp, line *line, records []string, delim string) bool {
if line.raw == "" {
line.raw = strings.Join(line.record, delim)
line.raw = strings.Join(records[line.recordStart:line.recordStart+line.recordNum], delim)
}
return re.MatchString(line.raw)
}
Expand Down
41 changes: 22 additions & 19 deletions extensions/omniv21/fileformat/flatfile/csv/decl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,26 @@ import (

func TestColumnDecl_LineMatch(t *testing.T) {
// no line_index/line_pattern, always match
assert.True(t, (&ColumnDecl{}).lineMatch(0, &line{}, ""))
assert.False(t, (&ColumnDecl{LineIndex: testlib.IntPtr(2)}).lineMatch(0, &line{}, ""))
assert.True(t, (&ColumnDecl{LineIndex: testlib.IntPtr(2)}).lineMatch(1, &line{}, ""))
assert.True(t, (&ColumnDecl{}).lineMatch(0, &line{}, nil, ""))
assert.False(t, (&ColumnDecl{LineIndex: testlib.IntPtr(2)}).lineMatch(0, &line{}, nil, ""))
assert.True(t, (&ColumnDecl{LineIndex: testlib.IntPtr(2)}).lineMatch(1, &line{}, nil, ""))
assert.False(t, (&ColumnDecl{linePatternRegexp: regexp.MustCompile("^ABC.*$")}).
lineMatch(0, &line{record: []string{"123", "456"}}, ","))
l := &line{record: []string{"ABC", "DEF"}}
lineMatch(0, &line{recordStart: 0, recordNum: 2}, []string{"123", "456"}, ","))
l := &line{recordStart: 0, recordNum: 2}
assert.True(t,
(&ColumnDecl{linePatternRegexp: regexp.MustCompile("^ABC\\|D.*$")}).lineMatch(0, l, "|"))
(&ColumnDecl{linePatternRegexp: regexp.MustCompile("^ABC\\|D.*$")}).lineMatch(
0, l, []string{"ABC", "DEF"}, "|"))
assert.Equal(t, "ABC|DEF", l.raw)
}

func TestColumnDecl_LineToColumnValue(t *testing.T) {
assert.Equal(t, "", (&ColumnDecl{Index: testlib.IntPtr(2)}).lineToColumnValue(
&line{record: []string{"1"}})) // index out of range
&line{recordNum: 1}, nil)) // index out of range
assert.Equal(t, "", (&ColumnDecl{Index: testlib.IntPtr(0)}).lineToColumnValue(
&line{record: []string{"1"}})) // index out of range
assert.Equal(t, "9", (&ColumnDecl{Index: testlib.IntPtr(5)}).lineToColumnValue(
&line{record: []string{"1", "3", "5", "7", "9", "11"}})) // in range
&line{recordNum: 1}, nil)) // index out of range
assert.Equal(t, "6", (&ColumnDecl{Index: testlib.IntPtr(5)}).lineToColumnValue(
&line{recordStart: 1, recordNum: 7}, // "2" .. "8"
[]string{"1", "2", "3", "4", "5", "6", "7", "8", "9"})) // in range
}

func TestRecordDecl(t *testing.T) {
Expand Down Expand Up @@ -85,24 +87,25 @@ func TestRecordDecl(t *testing.T) {

// matchHeader()
assert.PanicsWithValue(
t, "record 'r1' is not header/footer based", func() { r.matchHeader(&line{}, "") })
t, "record 'r1' is not header/footer based", func() { r.matchHeader(&line{}, nil, "") })
r.headerRegexp = regexp.MustCompile("^ABC,")
assert.False(t, r.matchHeader(&line{}, ","))
line := &line{record: []string{"ABC", "EFG"}}
assert.True(t, r.matchHeader(line, ","))
assert.False(t, r.matchHeader(&line{}, nil, ","))
line := &line{recordStart: 1, recordNum: 2}
assert.True(t, r.matchHeader(line, []string{"123", "ABC", "EFG"}, ","))

// matchFooter()
assert.True(t, r.matchFooter(line, ","))
assert.True(t, r.matchFooter(line, nil, ",")) // if `footer` not specified, always match.
r.footerRegexp = regexp.MustCompile("^ABC,EF.*$")
assert.True(t, r.matchFooter(line, ","))
assert.True(t, r.matchFooter(line, []string{"123", "ABC", "EFG"}, ","))
}

func TestMatchLine(t *testing.T) {
line := &line{record: []string{"1", "2"}}
records := []string{"0", "1", "2", "3"}
line := &line{recordStart: 1, recordNum: 2} // "1", "2"
assert.Equal(t, "", line.raw)
assert.False(t, matchLine(regexp.MustCompile("^1\\|2$"), line, ","))
assert.False(t, matchLine(regexp.MustCompile("^1\\|2$"), line, records, ","))
assert.Equal(t, "1,2", line.raw)
assert.True(t, matchLine(regexp.MustCompile("^1,2$"), line, ","))
assert.True(t, matchLine(regexp.MustCompile("^1,2$"), line, records, ","))
}

func TestToFlatFileRecDecls(t *testing.T) {
Expand Down
95 changes: 95 additions & 0 deletions extensions/omniv21/fileformat/flatfile/csv/format.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package csv

import (
"encoding/json"
"fmt"
"io"
"strings"

"github.com/antchfx/xpath"
"github.com/jf-tech/go-corelib/caches"
"github.com/jf-tech/go-corelib/strs"

"github.com/jf-tech/omniparser/errs"
"github.com/jf-tech/omniparser/extensions/omniv21/fileformat"
"github.com/jf-tech/omniparser/extensions/omniv21/transform"
v21validation "github.com/jf-tech/omniparser/extensions/omniv21/validation"
"github.com/jf-tech/omniparser/validation"
)

// fileFormatCSV is the file format name this package's FileFormat accepts; ValidateSchema
// returns errs.ErrSchemaNotSupported for any other format value.
const fileFormatCSV = "csv2"

// csvFormat is the receiver for the 'csv2' schema validation and format reader creation
// methods in this file.
type csvFormat struct {
	// schemaName identifies the schema; it is passed to schema validation and embedded in
	// every error produced by FmtErr.
	schemaName string
}

// NewCSVFileFormat returns the fileformat.FileFormat implementation for 'csv2' schemas.
// schemaName is kept for use in schema validation and in error messages.
func NewCSVFileFormat(schemaName string) fileformat.FileFormat {
	format := new(csvFormat)
	format.schemaName = schemaName
	return format
}

// csvFormatRuntime is the value ValidateSchema hands back on success and that
// CreateFormatReader expects as its runtime argument.
type csvFormatRuntime struct {
	// Decl is unmarshaled from the schema's 'file_declaration' section.
	Decl *FileDecl `json:"file_declaration"`
	// XPath is set by ValidateSchema to the whitespace-trimmed xpath of the FINAL_OUTPUT
	// declaration; it is empty when none is given.
	XPath string
}

// ValidateSchema checks that schemaContent is a valid 'csv2' schema and, on success, returns
// a *csvFormatRuntime holding the parsed file declaration and the FINAL_OUTPUT xpath.
//
// It returns errs.ErrSchemaNotSupported when format is not 'csv2'. Any other failure (JSON
// schema validation, file declaration validation, missing FINAL_OUTPUT, or an xpath that does
// not compile) is returned as an error that already carries the schema name.
func (f *csvFormat) ValidateSchema(
	format string, schemaContent []byte, finalOutputDecl *transform.Decl) (interface{}, error) {
	if format != fileFormatCSV {
		return nil, errs.ErrSchemaNotSupported
	}
	if err := validation.SchemaValidate(
		f.schemaName, schemaContent, v21validation.JSONSchemaCSV2FileDeclaration); err != nil {
		// err is already context formatted.
		return nil, err
	}
	rt := &csvFormatRuntime{}
	// The JSON schema validation above guarantees Unmarshal success, so the error is ignored.
	_ = json.Unmarshal(schemaContent, rt)
	if err := f.validateFileDecl(rt.Decl); err != nil {
		// err is already context formatted.
		return nil, err
	}
	if finalOutputDecl == nil {
		return nil, f.FmtErr("'FINAL_OUTPUT' is missing")
	}
	rt.XPath = strings.TrimSpace(strs.StrPtrOrElse(finalOutputDecl.XPath, ""))
	if rt.XPath == "" {
		// No xpath on FINAL_OUTPUT: nothing further to verify.
		return rt, nil
	}
	// Compile the xpath now so an invalid expression is reported at schema validation time.
	if _, xpathErr := caches.GetXPathExpr(rt.XPath); xpathErr != nil {
		return nil, f.FmtErr("'FINAL_OUTPUT.xpath' (value: '%s') is invalid, err: %s",
			rt.XPath, xpathErr.Error())
	}
	return rt, nil
}

// validateFileDecl runs the package's file declaration validation on decl. On failure the
// validation message is wrapped with the schema name via FmtErr; on success it returns nil.
func (f *csvFormat) validateFileDecl(decl *FileDecl) error {
	if err := (&validateCtx{}).validateFileDecl(decl); err != nil {
		// Pass the message as an argument rather than as the format string: if the message
		// contains a '%' (e.g. echoed from schema-supplied text), using it as the format
		// would garble it with "%!x(MISSING)" artifacts.
		return f.FmtErr("%s", err.Error())
	}
	return nil
}

// CreateFormatReader builds a reader over r from the runtime value that ValidateSchema
// returned. runtime must be a *csvFormatRuntime; any other type makes the type assertion
// panic. An empty or "." xpath means no target xpath expression is passed to the reader.
// An error is returned only if the xpath fails to compile.
func (f *csvFormat) CreateFormatReader(
	name string, r io.Reader, runtime interface{}) (fileformat.FormatReader, error) {
	rt := runtime.(*csvFormatRuntime)
	var targetXPathExpr *xpath.Expr
	if rt.XPath != "" && rt.XPath != "." {
		compiled, err := caches.GetXPathExpr(rt.XPath)
		if err != nil {
			return nil, f.FmtErr("xpath '%s' on 'FINAL_OUTPUT' is invalid: %s", rt.XPath, err.Error())
		}
		targetXPathExpr = compiled
	}
	return NewReader(name, r, rt.Decl, targetXPathExpr), nil
}

// FmtErr formats format and args Sprintf-style and returns the result as an error prefixed
// with the schema name, in the form "schema '<name>': <message>".
func (f *csvFormat) FmtErr(format string, args ...interface{}) error {
	detail := fmt.Sprintf(format, args...)
	return fmt.Errorf("schema '%s': %s", f.schemaName, detail)
}
Loading