-
Notifications
You must be signed in to change notification settings - Fork 0
/
anonymize-csv.scm
executable file
·107 lines (88 loc) · 3.18 KB
/
anonymize-csv.scm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/bin/sh
# -*- scheme -*-
exec guile -e main -s "$0" "$@"
!#
;; Double-Anonymize the trust.csv by replacing keys with
;; index-numbers. This prevents results from evaluations of the trust
;; graph from being applied directly to correlation attacks on Freenet
;; users.
;; TODO: use vhashes instead of regular hash tables.
(use-modules (ice-9 rdelim)
(ice-9 i18n)
(srfi srfi-69) ; hash tables
(srfi srfi-1) ; first, second, third
)
(define (set-add table . elements)
  ;; Insert every one of ELEMENTS into the set TABLE (a hash table whose
  ;; keys are the members and whose values are always #t).
  ;; Mutates TABLE in place and returns it, so calls can be chained.
  ;; Calling it without any elements simply returns TABLE unchanged.
  (for-each (lambda (element)
              (hash-table-set! table element #t))
            elements)
  table)
;; Return the members of the set TABLE as a list.
;; The set is represented as a hash table, so the members are its keys;
;; the order of the result is whatever hash-table-keys yields.
(define (set-keys table)
(hash-table-keys table))
;; Return the number of members in the set TABLE
;; (the entry count of the underlying hash table).
(define (set-size table)
(hash-table-size table))
(define (set->list-sorted table)
  ;; Return the members of the set TABLE as a list in ascending
  ;; string<? order. The members must therefore all be strings.
  (let ((members (set-keys table)))
    (sort-list members string<?)))
;; Create a new, empty set. Sets are represented as plain hash tables
;; (see the TODO in the file header about switching to vhashes).
(define (make-set)
(make-hash-table))
(define (get-ids port)
  ;; Read every remaining line from PORT, split it at semicolons and
  ;; collect the first two columns (source and target id) into a set.
  ;; Returns the distinct ids as a list sorted with string<?.
  ;; The header line must already have been consumed by the caller.
  ;; A line with fewer than two columns raises an error from `second'.
  (let ((seen (make-set)))
    (let scan ((row (read-line port)))
      (if (eof-object? row)
          (set->list-sorted seen)
          (let ((fields (string-split row #\;)))
            (set-add seen (first fields) (second fields))
            (scan (read-line port)))))))
(define (index-ids-fun ids)
  ;; Build a lookup procedure that maps each id in the list IDS to its
  ;; zero-based position in that list.
  ;; Returns a one-argument procedure; calling it with an id that was
  ;; not in IDS raises the error of `hash-table-ref' (no default given).
  ;; If an id occurs more than once, its last position wins.
  (let ((positions (make-hash-table)))
    ;; Thread the running index through the fold as the accumulator.
    (fold (lambda (id position)
            (hash-table-set! positions id position)
            (+ 1 position))
          0
          ids)
    (lambda (id) (hash-table-ref positions id))))
(define (check-csv-header port)
  ;; Consume the first line of PORT and verify that it is the expected
  ;; CSV header: it must start with "source;target", compared after
  ;; locale-aware downcasing, so any capitalisation is accepted and
  ;; additional columns (e.g. ";Weight") may follow.
  ;; Raises an error if the header does not match, or if PORT is
  ;; already at end-of-file (empty input). Returns an unspecified value
  ;; on success; the port is then positioned at the first data line.
  (let ((header (read-line port))
        (required-header-lowercase "source;target"))
    (cond
     ;; An empty file yields the EOF object, which is not a string and
     ;; must not reach string-locale-downcase.
     ((eof-object? header)
      (error (format #f "input file must have header '~A' (regardless of case) but is empty" required-header-lowercase)))
     ((not (string-prefix? required-header-lowercase (string-locale-downcase header)))
      (error (format #f "input file must have header '~A' (regardless of case) but has header '~A'" required-header-lowercase header))))))
(define (index-ids-from-file port)
  ;; Validate the CSV header on PORT, collect all ids from the
  ;; remaining lines and return a procedure mapping each id to its
  ;; index in the sorted list of distinct ids.
  (check-csv-header port)
  (index-ids-fun (get-ids port)))
(define (anonymize-ids id->index inport outport)
  ;; Copy the trust CSV from INPORT to OUTPORT, replacing the ids in
  ;; the first two columns with the numbers produced by ID->INDEX.
  ;; The input header is validated and consumed; a fixed header
  ;; "Source;Target;Weight" is written to OUTPORT. The third column
  ;; (weight) is passed through unchanged; further columns are dropped.
  ;; Returns #t once the input is exhausted.
  (check-csv-header inport)
  (format outport "Source;Target;Weight\n")
  (let next-row ((row (read-line inport)))
    (if (eof-object? row)
        #t
        (begin
          ;; let* keeps the lookups in source, target, weight order.
          (let* ((fields (string-split row #\;))
                 (source-index (id->index (first fields)))
                 (target-index (id->index (second fields)))
                 (weight (third fields)))
            (format outport "~A;~A;~A\n" source-index target-index weight))
          (next-row (read-line inport))))))
(define (main args)
  ;; Script entry point. ARGS is the command line: program name,
  ;; optional input file (default "trust-deduplicated.csv") and
  ;; optional output file (default "trust-anonymized.csv").
  ;; The input is read twice: once to build the id->index mapping and
  ;; once to write the anonymized copy.
  ;;
  ;; let* makes the order explicit: the input is fully validated and
  ;; indexed before the output file is opened (and thereby truncated).
  (let* ((infile (if (null? (cdr args))
                     "trust-deduplicated.csv"
                     (second args)))
         (outfile (if (or (null? (cdr args)) (null? (cdr (cdr args))))
                      "trust-anonymized.csv"
                      (third args)))
         (id->index (call-with-input-file infile index-ids-from-file))
         (inport (open-input-file infile)))
    ;; dynamic-wind guarantees that both ports are closed (and the
    ;; output flushed) even when anonymize-ids raises on a bad line.
    (dynamic-wind
      (lambda () #f)
      (lambda ()
        (let ((outport (open-output-file outfile)))
          (dynamic-wind
            (lambda () #f)
            (lambda () (anonymize-ids id->index inport outport))
            (lambda () (close-port outport)))))
      (lambda () (close-port inport)))))