-
Notifications
You must be signed in to change notification settings - Fork 1
/
dtcheck.go
161 lines (132 loc) · 3.37 KB
/
dtcheck.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
// Copyright (c) 2016 Johann Höchtl
// See LICENSE for license
/*
Package datetimecheck reports whether a dataset contains date or time information.
*/
package datetimecheck
import (
"bytes"
"encoding/csv"
"io"
"regexp"
"strings"
"github.com/rakyll/magicmime"
"github.com/the42/csvprober"
)
// Checkupto is the maximum number of bytes of a dataset to check.
const Checkupto = 8096
// DateTimeChecker checks data for hints of date or time information.
// Instances are created with NewDateTimeChecker and released with Close.
type DateTimeChecker struct {
// d is the magicmime decoder used to autodetect the MIME type of the data.
d *magicmime.Decoder
// l is the size in bytes of the buffer ContainsDateTimeStream allocates for reading from a stream.
l int
}
// Occurence records one place where a date/time pattern matched.
type Occurence struct {
// Line is the 1-based number of the CSV record that contains the matching field.
Line int `json:",omitempty"`
// Offsets holds the start/end index pairs of all matches inside the matching
// field, as returned by regexp.FindAllStringIndex.
Offsets [][]int `json:",omitempty"`
// XPath is not set by any code visible in this file.
// NOTE(review): presumably reserved for XML datasets - confirm.
XPath *string `json:",omitempty"`
}
// DateTimeCheckResponse is the result of a date/time check.
type DateTimeCheckResponse struct {
// ContainsDT is true when at least one Occurence was found.
ContainsDT bool
// MimeType is the autodetected MIME type; it stays nil when the caller supplied a MIME type.
MimeType *string
// CheckType is "csv" when the data was checked as CSV, otherwise the lower-cased MIME type.
CheckType *string
// Read is the number of CSV records that were read.
Read int
// Occurence lists every field in which a date/time pattern matched.
Occurence []Occurence
}
// Patterns used to spot date/time information. Both match case-insensitively
// and are compiled once at package initialisation.
var (
	// metadatadts matches (German and English) column-header keywords such
	// as "jahr", "monat", "tag", "datum", "zeit", "datetime" and "timestamp".
	// It is applied to the first CSV record.
	metadatadts = regexp.MustCompile("(?i)jahr|monat|tag|datum|zeit|datetime|timestamp")

	// dataitemdt matches date/time-like values in data fields: d.m.yyyy
	// dates, yyyy-m prefixes, hh:mm times, German month abbreviations and
	// four-digit years starting with 19 or 20. It is applied to every CSV
	// record after the first.
	dataitemdt = regexp.MustCompile(`(?i)\d{1,2}\.\d{1,2}\.\d{4}|\d{4}-\d{1,2}|\d{2}:\d{2}|jän|jan|feb|märz|apr|mai|jun|jul|aug|sep|okt|nov|dez|19\d\d|20\d\d`)
)
// ContainsDateTimeBytes inspects b for hints of date or time information.
//
// mt is the MIME type of b. When mt is nil or empty, the type is detected
// from b with the checker's magicmime decoder and reported in the response's
// MimeType field; a caller-supplied type is not echoed back there.
//
// Any type whose lower-cased name contains "csv" or "text" is scanned as CSV
// and CheckType is set to "csv". Data of any other type is not scanned:
// CheckType then holds the lower-cased MIME type and ContainsDT is false.
//
// A nil response and a non-nil error are returned when MIME detection,
// delimiter probing or CSV parsing fails.
func (d *DateTimeChecker) ContainsDateTimeBytes(b []byte, mt *string) (*DateTimeCheckResponse, error) {
	oc := &DateTimeCheckResponse{}

	var dt string
	if mt == nil || len(*mt) == 0 {
		// No MIME type given, autodetect.
		r, err := d.d.TypeByBuffer(b)
		if err != nil {
			return nil, err
		}
		dt = r
		oc.MimeType = &r
	} else {
		dt = *mt
	}

	// Map the MIME type to the kind of check to run.
	dt = strings.ToLower(dt)
	if strings.Contains(dt, "csv") || strings.Contains(dt, "text") {
		dt = "csv"
	}
	oc.CheckType = &dt

	if dt == "csv" {
		found, n, err := scanCSVForDateTime(b)
		if err != nil {
			return nil, err
		}
		oc.Occurence = found
		oc.Read = n
	}

	oc.ContainsDT = len(oc.Occurence) > 0
	return oc, nil
}

// scanCSVForDateTime parses b as CSV and returns every field that matches a
// date/time pattern, together with the number of records read. The first
// record is treated as a header and matched against metadatadts; all later
// records are matched against dataitemdt.
func scanCSVForDateTime(b []byte) ([]Occurence, int, error) {
	// The prober and the CSV parser each get their own reader over b, so
	// probing cannot consume data the parser still needs.
	prober := csvprober.NewProber()
	probe, err := prober.Probe(bytes.NewBuffer(b))
	if err != nil {
		return nil, 0, err
	}

	cr := csv.NewReader(bytes.NewReader(b))
	// Use the first delimiter candidate reported by the prober (presumably
	// the most likely one - confirm against csvprober). Without any
	// candidate, keep the csv package default instead of panicking on an
	// empty slice.
	if len(probe.CSVprobability) > 0 {
		cr.Comma = probe.CSVprobability[0].Delimiter
	}
	cr.LazyQuotes = true
	// The input is often only the leading part of a larger file, so the last
	// record may be cut short, and plain text is ragged anyway. Do not fail
	// the whole check because a record has a different number of fields.
	cr.FieldsPerRecord = -1

	var found []Occurence
	n := 0
	for ; ; n++ {
		record, err := cr.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, 0, err
		}

		// Header fields are checked for date/time keywords, data fields for
		// date/time-like values.
		re := dataitemdt
		if n == 0 {
			re = metadatadts
		}
		for _, field := range record {
			if pos := re.FindAllStringIndex(field, -1); len(pos) > 0 {
				found = append(found, Occurence{Line: n + 1, Offsets: pos})
			}
		}
	}
	return found, n, nil
}
// ContainsDateTimeStream reads at most the checker's byte limit from s and
// checks the data read with ContainsDateTimeBytes. A non-positive limit falls
// back to Checkupto.
//
// mt is the MIME type of the stream; nil or empty requests autodetection.
//
// A stream shorter than the limit is not an error: only the bytes actually
// read are checked. Any other read error is returned together with a nil
// response.
func (d *DateTimeChecker) ContainsDateTimeStream(s io.Reader, mt *string) (*DateTimeCheckResponse, error) {
	limit := d.l
	if limit <= 0 {
		// A zero-sized buffer would read nothing and a negative size would
		// panic in make.
		limit = Checkupto
	}

	buf := make([]byte, limit)
	n, err := io.ReadFull(s, buf)
	// io.EOF (nothing read) and io.ErrUnexpectedEOF (short read) only signal
	// that the stream ended before the buffer was full.
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return nil, err
	}

	// Pass on only what was read, so that the unused, zero-filled tail of
	// buf does not distort MIME detection or CSV parsing.
	return d.ContainsDateTimeBytes(buf[:n], mt)
}
// ContainsDateTimeBytes is a convenience wrapper that creates a temporary
// DateTimeChecker, checks b with its ContainsDateTimeBytes method and closes
// the checker again before returning.
//
// mt is the MIME type of b; nil or empty requests autodetection. An error
// from creating the checker or from the check itself is returned with a nil
// response.
func ContainsDateTimeBytes(b []byte, mt *string) (*DateTimeCheckResponse, error) {
	checker, err := NewDateTimeChecker(0)
	if err != nil {
		return nil, err
	}
	defer checker.Close()

	return checker.ContainsDateTimeBytes(b, mt)
}
// ContainsDatetimeReader is a convenience wrapper that creates a temporary
// DateTimeChecker with a limit of Checkupto bytes, checks the data read from
// r with ContainsDateTimeStream and closes the checker again before
// returning.
//
// mt is the MIME type of the data; nil or empty requests autodetection. An
// error from creating the checker or from the check itself is returned with a
// nil response.
func ContainsDatetimeReader(r io.Reader, mt *string) (*DateTimeCheckResponse, error) {
	// A limit of 0 would size the read buffer to zero bytes, so nothing
	// would ever be read from r; ask for the package limit explicitly.
	d, err := NewDateTimeChecker(Checkupto)
	if err != nil {
		return nil, err
	}
	defer d.Close()

	return d.ContainsDateTimeStream(r, mt)
}
// NewDateTimeChecker creates a DateTimeChecker backed by a new magicmime
// decoder opened with the MAGIC_MIME flag.
//
// limit is stored in the checker and determines how many bytes
// ContainsDateTimeStream reads from a stream. The error of the decoder
// constructor is returned unchanged, with a nil checker. Release the checker
// with Close when it is no longer needed.
func NewDateTimeChecker(limit int) (*DateTimeChecker, error) {
	decoder, err := magicmime.NewDecoder(magicmime.Flag(magicmime.MAGIC_MIME))
	if err != nil {
		return nil, err
	}
	return &DateTimeChecker{d: decoder, l: limit}, nil
}
// Close closes the magicmime decoder held by the checker.
// NOTE(review): presumably the checker must not be used after Close - confirm against magicmime.
func (d *DateTimeChecker) Close() {
d.d.Close()
}