Skip to content

Commit d24ee6a

Browse files
authored
Adding csv2 file format. Fixing a major bug in the csv2.reader implementation (#179)
Because we use `encoding/csv.Reader.ReuseRecord`, the `[]string` returned by each call to `csv.Reader.Read()` is a cached slice that gets reused by subsequent calls. Given that we potentially need to cache multiple lines, and thus make multiple calls to `Read()`, what ends up in each `linesBuf.record` is always a duplicate of the same data!! We could fix this problem trivially by turning `ReuseRecord` off, but that would incur an allocation cost for the vast majority of single-line csv operations, which is completely undesirable. So instead, we ourselves cache all the strings returned from (potentially multiple) calls to `csv.Reader.Read()` into the `reader.records []string` slice and manage that buffer ourselves, thus practically eliminating the over-allocation problem. Accordingly, in the `reader.linesBuf`, instead of having a `record []string`, we have `recordStart` and `recordNum` to reference into `reader.records`.
1 parent e9d3be8 commit d24ee6a

File tree

9 files changed

+567
-64
lines changed

9 files changed

+567
-64
lines changed

extensions/omniv21/fileformat/flatfile/csv/.snapshots/TestRead-multiple_records

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"Children": [
55
{
66
"Children": null,
7-
"Data": "v3",
7+
"Data": "v1",
88
"FirstChild": null,
99
"FormatSpecific": null,
1010
"LastChild": null,
@@ -15,9 +15,9 @@
1515
}
1616
],
1717
"Data": "r1c1",
18-
"FirstChild": "(TextNode 'v3')",
18+
"FirstChild": "(TextNode 'v1')",
1919
"FormatSpecific": null,
20-
"LastChild": "(TextNode 'v3')",
20+
"LastChild": "(TextNode 'v1')",
2121
"NextSibling": "(ElementNode r1c2)",
2222
"Parent": "(ElementNode r1)",
2323
"PrevSibling": null,
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
{
2+
"file_declaration": {
3+
"delimiter": ",",
4+
"records": [
5+
{
6+
"name": "e1",
7+
"type": "record_group",
8+
"is_target": true,
9+
"min": 1,
10+
"max": 1,
11+
"child_records": [
12+
{
13+
"name": "e2",
14+
"max": 5,
15+
"columns": [
16+
{
17+
"name": "c1",
18+
"index": 2
19+
}
20+
]
21+
},
22+
{
23+
"name": "e2",
24+
"header": "^ABC$",
25+
"columns": [
26+
{
27+
"name": "c2",
28+
"index": 1,
29+
"line_pattern": "^H00"
30+
}
31+
]
32+
}
33+
]
34+
}
35+
]
36+
},
37+
"XPath": ".[c1 != 'skip']"
38+
}

extensions/omniv21/fileformat/flatfile/csv/decl.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,21 +20,21 @@ type ColumnDecl struct {
2020
linePatternRegexp *regexp.Regexp
2121
}
2222

23-
func (c *ColumnDecl) lineMatch(lineIndex int, line *line, delim string) bool {
23+
func (c *ColumnDecl) lineMatch(lineIndex int, line *line, records []string, delim string) bool {
2424
if c.LineIndex != nil {
2525
return *c.LineIndex == lineIndex+1 // c.LineIndex is 1 based.
2626
}
2727
if c.linePatternRegexp != nil {
28-
return matchLine(c.linePatternRegexp, line, delim)
28+
return matchLine(c.linePatternRegexp, line, records, delim)
2929
}
3030
return true
3131
}
3232

33-
func (c *ColumnDecl) lineToColumnValue(line *line) string {
34-
if *c.Index < 1 || *c.Index > len(line.record) {
33+
func (c *ColumnDecl) lineToColumnValue(line *line, records []string) string {
34+
if *c.Index < 1 || *c.Index > line.recordNum {
3535
return ""
3636
}
37-
return line.record[*c.Index-1]
37+
return records[line.recordStart+*c.Index-1]
3838
}
3939

4040
const (
@@ -122,26 +122,26 @@ func (r *RecordDecl) rows() int {
122122
return *r.Rows
123123
}
124124

125-
func (r *RecordDecl) matchHeader(line *line, delim string) bool {
125+
func (r *RecordDecl) matchHeader(line *line, records []string, delim string) bool {
126126
if r.headerRegexp == nil {
127127
panic(fmt.Sprintf("record '%s' is not header/footer based", r.fqdn))
128128
}
129-
return matchLine(r.headerRegexp, line, delim)
129+
return matchLine(r.headerRegexp, line, records, delim)
130130
}
131131

132132
// Footer is optional. If not specified, it always matches. Thus for a header/footer record,
133133
// if the footer isn't specified, it effectively becomes a single-row record matched by header,
134134
// given that after the header matches a line, matchFooter is called on the same line.
135-
func (r *RecordDecl) matchFooter(line *line, delim string) bool {
135+
func (r *RecordDecl) matchFooter(line *line, records []string, delim string) bool {
136136
if r.footerRegexp == nil {
137137
return true
138138
}
139-
return matchLine(r.footerRegexp, line, delim)
139+
return matchLine(r.footerRegexp, line, records, delim)
140140
}
141141

142-
func matchLine(re *regexp.Regexp, line *line, delim string) bool {
142+
func matchLine(re *regexp.Regexp, line *line, records []string, delim string) bool {
143143
if line.raw == "" {
144-
line.raw = strings.Join(line.record, delim)
144+
line.raw = strings.Join(records[line.recordStart:line.recordStart+line.recordNum], delim)
145145
}
146146
return re.MatchString(line.raw)
147147
}

extensions/omniv21/fileformat/flatfile/csv/decl_test.go

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,24 +13,26 @@ import (
1313

1414
func TestColumnDecl_LineMatch(t *testing.T) {
1515
// no line_index/line_pattern, always match
16-
assert.True(t, (&ColumnDecl{}).lineMatch(0, &line{}, ""))
17-
assert.False(t, (&ColumnDecl{LineIndex: testlib.IntPtr(2)}).lineMatch(0, &line{}, ""))
18-
assert.True(t, (&ColumnDecl{LineIndex: testlib.IntPtr(2)}).lineMatch(1, &line{}, ""))
16+
assert.True(t, (&ColumnDecl{}).lineMatch(0, &line{}, nil, ""))
17+
assert.False(t, (&ColumnDecl{LineIndex: testlib.IntPtr(2)}).lineMatch(0, &line{}, nil, ""))
18+
assert.True(t, (&ColumnDecl{LineIndex: testlib.IntPtr(2)}).lineMatch(1, &line{}, nil, ""))
1919
assert.False(t, (&ColumnDecl{linePatternRegexp: regexp.MustCompile("^ABC.*$")}).
20-
lineMatch(0, &line{record: []string{"123", "456"}}, ","))
21-
l := &line{record: []string{"ABC", "DEF"}}
20+
lineMatch(0, &line{recordStart: 0, recordNum: 2}, []string{"123", "456"}, ","))
21+
l := &line{recordStart: 0, recordNum: 2}
2222
assert.True(t,
23-
(&ColumnDecl{linePatternRegexp: regexp.MustCompile("^ABC\\|D.*$")}).lineMatch(0, l, "|"))
23+
(&ColumnDecl{linePatternRegexp: regexp.MustCompile("^ABC\\|D.*$")}).lineMatch(
24+
0, l, []string{"ABC", "DEF"}, "|"))
2425
assert.Equal(t, "ABC|DEF", l.raw)
2526
}
2627

2728
func TestColumnDecl_LineToColumnValue(t *testing.T) {
2829
assert.Equal(t, "", (&ColumnDecl{Index: testlib.IntPtr(2)}).lineToColumnValue(
29-
&line{record: []string{"1"}})) // index out of range
30+
&line{recordNum: 1}, nil)) // index out of range
3031
assert.Equal(t, "", (&ColumnDecl{Index: testlib.IntPtr(0)}).lineToColumnValue(
31-
&line{record: []string{"1"}})) // index out of range
32-
assert.Equal(t, "9", (&ColumnDecl{Index: testlib.IntPtr(5)}).lineToColumnValue(
33-
&line{record: []string{"1", "3", "5", "7", "9", "11"}})) // in range
32+
&line{recordNum: 1}, nil)) // index out of range
33+
assert.Equal(t, "6", (&ColumnDecl{Index: testlib.IntPtr(5)}).lineToColumnValue(
34+
&line{recordStart: 1, recordNum: 7}, // "2" .. "8"
35+
[]string{"1", "2", "3", "4", "5", "6", "7", "8", "9"})) // in range
3436
}
3537

3638
func TestRecordDecl(t *testing.T) {
@@ -85,24 +87,25 @@ func TestRecordDecl(t *testing.T) {
8587

8688
// matchHeader()
8789
assert.PanicsWithValue(
88-
t, "record 'r1' is not header/footer based", func() { r.matchHeader(&line{}, "") })
90+
t, "record 'r1' is not header/footer based", func() { r.matchHeader(&line{}, nil, "") })
8991
r.headerRegexp = regexp.MustCompile("^ABC,")
90-
assert.False(t, r.matchHeader(&line{}, ","))
91-
line := &line{record: []string{"ABC", "EFG"}}
92-
assert.True(t, r.matchHeader(line, ","))
92+
assert.False(t, r.matchHeader(&line{}, nil, ","))
93+
line := &line{recordStart: 1, recordNum: 2}
94+
assert.True(t, r.matchHeader(line, []string{"123", "ABC", "EFG"}, ","))
9395

9496
// matchFooter()
95-
assert.True(t, r.matchFooter(line, ","))
97+
assert.True(t, r.matchFooter(line, nil, ",")) // if `footer` not specified, always match.
9698
r.footerRegexp = regexp.MustCompile("^ABC,EF.*$")
97-
assert.True(t, r.matchFooter(line, ","))
99+
assert.True(t, r.matchFooter(line, []string{"123", "ABC", "EFG"}, ","))
98100
}
99101

100102
func TestMatchLine(t *testing.T) {
101-
line := &line{record: []string{"1", "2"}}
103+
records := []string{"0", "1", "2", "3"}
104+
line := &line{recordStart: 1, recordNum: 2} // "1", "2"
102105
assert.Equal(t, "", line.raw)
103-
assert.False(t, matchLine(regexp.MustCompile("^1\\|2$"), line, ","))
106+
assert.False(t, matchLine(regexp.MustCompile("^1\\|2$"), line, records, ","))
104107
assert.Equal(t, "1,2", line.raw)
105-
assert.True(t, matchLine(regexp.MustCompile("^1,2$"), line, ","))
108+
assert.True(t, matchLine(regexp.MustCompile("^1,2$"), line, records, ","))
106109
}
107110

108111
func TestToFlatFileRecDecls(t *testing.T) {
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
package csv
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
"io"
7+
"strings"
8+
9+
"github.com/antchfx/xpath"
10+
"github.com/jf-tech/go-corelib/caches"
11+
"github.com/jf-tech/go-corelib/strs"
12+
13+
"github.com/jf-tech/omniparser/errs"
14+
"github.com/jf-tech/omniparser/extensions/omniv21/fileformat"
15+
"github.com/jf-tech/omniparser/extensions/omniv21/transform"
16+
v21validation "github.com/jf-tech/omniparser/extensions/omniv21/validation"
17+
"github.com/jf-tech/omniparser/validation"
18+
)
19+
20+
// fileFormatCSV is the file format name handled by this package: ValidateSchema returns
// errs.ErrSchemaNotSupported for any other format value.
const (
21+
fileFormatCSV = "csv2"
22+
)
23+
24+
// csvFormat is the 'csv2' file format handler returned by NewCSVFileFormat. schemaName is
// the schema name used for schema validation and as the prefix of errors built by FmtErr.
type csvFormat struct {
25+
schemaName string
26+
}
27+
28+
// NewCSVFileFormat creates a FileFormat for 'csv2'.
29+
func NewCSVFileFormat(schemaName string) fileformat.FileFormat {
30+
return &csvFormat{schemaName: schemaName}
31+
}
32+
33+
// csvFormatRuntime is the runtime data returned by ValidateSchema and type-asserted back by
// CreateFormatReader. Decl is unmarshaled from the schema's "file_declaration" section; XPath
// is set by ValidateSchema to the whitespace-trimmed xpath of 'FINAL_OUTPUT' (empty if none).
type csvFormatRuntime struct {
34+
Decl *FileDecl `json:"file_declaration"`
35+
XPath string
36+
}
37+
38+
func (f *csvFormat) ValidateSchema(
39+
format string, schemaContent []byte, finalOutputDecl *transform.Decl) (interface{}, error) {
40+
if format != fileFormatCSV {
41+
return nil, errs.ErrSchemaNotSupported
42+
}
43+
err := validation.SchemaValidate(
44+
f.schemaName, schemaContent, v21validation.JSONSchemaCSV2FileDeclaration)
45+
if err != nil {
46+
// err is already context formatted.
47+
return nil, err
48+
}
49+
var runtime csvFormatRuntime
50+
_ = json.Unmarshal(schemaContent, &runtime) // JSON schema validation earlier guarantees Unmarshal success.
51+
err = f.validateFileDecl(runtime.Decl)
52+
if err != nil {
53+
// err is already context formatted.
54+
return nil, err
55+
}
56+
if finalOutputDecl == nil {
57+
return nil, f.FmtErr("'FINAL_OUTPUT' is missing")
58+
}
59+
runtime.XPath = strings.TrimSpace(strs.StrPtrOrElse(finalOutputDecl.XPath, ""))
60+
if runtime.XPath != "" {
61+
_, err := caches.GetXPathExpr(runtime.XPath)
62+
if err != nil {
63+
return nil, f.FmtErr("'FINAL_OUTPUT.xpath' (value: '%s') is invalid, err: %s",
64+
runtime.XPath, err.Error())
65+
}
66+
}
67+
return &runtime, nil
68+
}
69+
70+
func (f *csvFormat) validateFileDecl(decl *FileDecl) error {
71+
err := (&validateCtx{}).validateFileDecl(decl)
72+
if err != nil {
73+
return f.FmtErr(err.Error())
74+
}
75+
return err
76+
}
77+
78+
func (f *csvFormat) CreateFormatReader(
79+
name string, r io.Reader, runtime interface{}) (fileformat.FormatReader, error) {
80+
rt := runtime.(*csvFormatRuntime)
81+
targetXPathExpr, err := func() (*xpath.Expr, error) {
82+
if rt.XPath == "" || rt.XPath == "." {
83+
return nil, nil
84+
}
85+
return caches.GetXPathExpr(rt.XPath)
86+
}()
87+
if err != nil {
88+
return nil, f.FmtErr("xpath '%s' on 'FINAL_OUTPUT' is invalid: %s", rt.XPath, err.Error())
89+
}
90+
return NewReader(name, r, rt.Decl, targetXPathExpr), nil
91+
}
92+
93+
func (f *csvFormat) FmtErr(format string, args ...interface{}) error {
94+
return fmt.Errorf("schema '%s': %s", f.schemaName, fmt.Sprintf(format, args...))
95+
}

0 commit comments

Comments
 (0)