-
Notifications
You must be signed in to change notification settings - Fork 7
/
v2_util.go
125 lines (112 loc) · 3.39 KB
/
v2_util.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
package lens
import (
"errors"
"fmt"
"net/http"
"strings"
"time"
"github.com/RTradeLtd/Lens/v2/engine"
"github.com/RTradeLtd/Lens/v2/logs"
"github.com/RTradeLtd/Lens/v2/models"
)
// magnifyOpts declares configuration for magnification
type magnifyOpts struct {
DisplayName string
Reindex bool
Tags []string
}
func (v *V2) magnify(hash string, opts magnifyOpts) (content string, metadata *models.MetaDataV2, err error) {
if v.se.IsIndexed(hash) && !opts.Reindex {
return "", nil, fmt.Errorf("object '%s' has already been indexed", hash)
}
// set up args
if opts.Tags == nil {
opts.Tags = make([]string, 0)
}
// set up logger
var l = logs.NewProcessLogger(v.l, "magnify", "hash", hash)
var start = time.Now()
defer func() { l.Infow("magnification ended", "duration", time.Since(start)) }()
// retrieve object and detect content type
contents, err := v.px.ExtractContents(hash)
if err != nil {
return "", nil, fmt.Errorf("failed to find content for hash '%s'", hash)
}
contentType := http.DetectContentType(contents)
if contentType == "" {
return "", nil, fmt.Errorf("unknown content type for document '%s'", hash)
}
l.Infow("object retrieved and content type detected",
"content_type", contentType)
// contentType will be in the format of `<content-type>; charset=...`
// we use strings.FieldsFunc to separate the string, and to be able to examine
// the content type
var parsed = strings.FieldsFunc(contentType, func(r rune) bool { return (r == ';') })
if parsed == nil || len(parsed) == 0 {
return "", nil, fmt.Errorf("invalid content type '%s'", contentType)
}
// scrape for content based on content-type
var category models.MimeType
switch parsed[0] {
case "application/pdf":
category = models.MimeTypePDF
text, err := v.oc.Analyze(hash, contents, "pdf")
if err != nil {
return "", nil, err
}
content = text
default:
var parsed2 = strings.FieldsFunc(contentType, func(r rune) bool { return (r == '/') })
if parsed2 == nil || len(parsed2) == 0 {
return "", nil, fmt.Errorf("invalid content type '%s'", contentType)
}
switch parsed2[0] {
case "text":
category = models.MimeTypeDocument
content = string(contents)
case "image":
category = models.MimeTypeImage
keyword, err := v.tf.Analyze(hash, contents)
if err != nil {
l.Warnw("failed to categorize image", "error", err)
return "", nil, errors.New("failed to categorize image")
}
// grab any text in image
text, err := v.oc.Analyze(hash, contents, "image")
if err != nil {
l.Warnw("failed to OCR image", "error", err)
content = keyword
} else {
content = text
}
opts.Tags = append(opts.Tags, keyword)
default:
return "", nil, errors.New("unsupported content type for indexing")
}
}
return content, &models.MetaDataV2{
DisplayName: opts.DisplayName,
MimeType: contentType,
Category: string(category),
Tags: opts.Tags,
}, nil
}
// Store is used to store our collected meta data in a formatted object
func (v *V2) store(hash, content string, md *models.MetaDataV2, reindex bool) error {
return v.se.Index(engine.Document{
Object: &models.ObjectV2{
Hash: hash,
MD: *md,
},
Content: content,
Reindex: reindex,
})
}
// Remove is used to remove an indexed object
func (v *V2) remove(hash string) error {
if !v.se.IsIndexed(hash) {
return fmt.Errorf("object '%s' does not exist", hash)
}
v.se.Remove(hash)
return nil
}