, TLSH> cache = new HashMap<>();
- private TLSHCache() {}
-
- public TLSH getTLSH(BucketOption bo, ChecksumOption co) {
- return cache.computeIfAbsent( new AbstractMap.SimpleEntry<>(bo, co)
- , kv -> new TLSH(kv.getKey(), kv.getValue())
- );
- }
-}
diff --git a/flink-cyber/flink-stellar/src/main/java/org/apache/metron/stellar/common/utils/hashing/tlsh/TLSHConstants.java b/flink-cyber/flink-stellar/src/main/java/org/apache/metron/stellar/common/utils/hashing/tlsh/TLSHConstants.java
new file mode 100644
index 000000000..0957ab17e
--- /dev/null
+++ b/flink-cyber/flink-stellar/src/main/java/org/apache/metron/stellar/common/utils/hashing/tlsh/TLSHConstants.java
@@ -0,0 +1,135 @@
+package org.apache.metron.stellar.common.utils.hashing.tlsh;
+
+import java.nio.charset.StandardCharsets;
+
+public final class TLSHConstants {
+ // Static lookup tables and numeric constants for the TLSH hash implementation.
+ /**
+ * The 256-entry substitution table used for Pearson hashing.
+ */
+ public static final int[] PEARSON_TABLE = {1, 87, 49, 12, 176, 178, 102, 166, 121, 193, 6, 84, 249, 230, 44, 163, 14, 197, 213,
+ 181, 161, 85, 218, 80, 64, 239, 24, 226, 236, 142, 38, 200, 110, 177, 104, 103, 141, 253, 255, 50, 77, 101,
+ 81, 18, 45, 96, 31, 222, 25, 107, 190, 70, 86, 237, 240, 34, 72, 242, 20, 214, 244, 227, 149, 235, 97, 234,
+ 57, 22, 60, 250, 82, 175, 208, 5, 127, 199, 111, 62, 135, 248, 174, 169, 211, 58, 66, 154, 106, 195, 245,
+ 171, 17, 187, 182, 179, 0, 243, 132, 56, 148, 75, 128, 133, 158, 100, 130, 126, 91, 13, 153, 246, 216, 219,
+ 119, 68, 223, 78, 83, 88, 201, 99, 122, 11, 92, 32, 136, 114, 52, 10, 138, 30, 48, 183, 156, 35, 61, 26,
+ 143, 74, 251, 94, 129, 162, 63, 152, 170, 7, 115, 167, 241, 206, 3, 150, 55, 59, 151, 220, 90, 53, 23, 131,
+ 125, 173, 15, 238, 79, 95, 89, 16, 105, 137, 225, 224, 217, 160, 37, 123, 118, 73, 2, 157, 46, 116, 9, 145,
+ 134, 228, 207, 212, 202, 215, 69, 229, 27, 188, 67, 124, 168, 252, 42, 4, 29, 108, 21, 247, 19, 205, 39,
+ 203, 233, 40, 186, 147, 198, 192, 155, 33, 164, 191, 98, 204, 165, 180, 117, 76, 140, 36, 210, 172, 41, 54,
+ 159, 8, 185, 232, 113, 196, 231, 47, 146, 120, 51, 65, 28, 144, 254, 221, 93, 189, 194, 139, 112, 43, 71,
+ 109, 184, 209};
+
+ /**
+ * Lookup table for the logs of the length value: entry n holds the largest
+ * length whose logLength is n. The last entry is {@code Long.MAX_VALUE}, which
+ * saturates the logLength at 255.
+ *
+ * For example, 7 -> 25L means 25 is the highest number for which the log is 7.
+ * Generally speaking, for the closed interval [ TOPVAL(n-1)+1 .. TOPVAL(n) ] the
+ * logLength is n.
+ */
+ static final long[] TOPVAL = {/* 0 */ 1, /* 1 */ 2, /* 2 */ 3, /* 3 */ 5, /* 4 */ 7, /* 5 */ 11, /* 6 */ 17,
+ /* 7 */ 25, /* 8 */ 38, /* 9 */ 57, /* 10 */ 86, /* 11 */ 129, /* 12 */ 194, /* 13 */ 291, /* 14 */ 437,
+ /* 15 */ 656, /* 16 */ 854, /* 17 */ 1_110, /* 18 */ 1443, /* 19 */ 1876, /* 20 */ 2439, /* 21 */ 3171,
+ /* 22 */ 3475, /* 23 */ 3823, /* 24 */ 4205, /* 25 */ 4626, /* 26 */ 5088, /* 27 */ 5597, /* 28 */ 6157,
+ /* 29 */ 6772, /* 30 */ 7450, /* 31 */ 8195, /* 32 */ 9014, /* 33 */ 9916, /* 34 */ 10_907, /* 35 */ 11_998,
+ /* 36 */ 13_198, /* 37 */ 14_518, /* 38 */ 15_970, /* 39 */ 17_567, /* 40 */ 19_323, /* 41 */ 21_256,
+ /* 42 */ 23_382, /* 43 */ 25_720, /* 44 */ 28_292, /* 45 */ 31_121, /* 46 */ 34_233, /* 47 */ 37_656,
+ /* 48 */ 41_422, /* 49 */ 45_564, /* 50 */ 50_121, /* 51 */ 55_133, /* 52 */ 60_646, /* 53 */ 66_711,
+ /* 54 */ 73_382, /* 55 */ 80_721, /* 56 */ 88_793, /* 57 */ 97_672, /* 58 */ 107_439, /* 59 */ 118_183,
+ /* 60 */ 130_002, /* 61 */ 143_002, /* 62 */ 157_302, /* 63 */ 173_032, /* 64 */ 190_335, /* 65 */ 209_369,
+ /* 66 */ 230_306, /* 67 */ 253_337, /* 68 */ 278_670, /* 69 */ 306_538, /* 70 */ 337_191, /* 71 */ 370_911,
+ /* 72 */ 408_002, /* 73 */ 448_802, /* 74 */ 493_682, /* 75 */ 543_050, /* 76 */ 597_356, /* 77 */ 657_091,
+ /* 78 */ 722_800, /* 79 */ 795_081, /* 80 */ 874_589, /* 81 */ 962_048, /* 82 */ 1_058_252,
+ /* 83 */ 1_164_078, /* 84 */ 1_280_486, /* 85 */ 1_408_534, /* 86 */ 1_549_388, /* 87 */ 1_704_327,
+ /* 88 */ 1_874_759, /* 89 */ 2_062_236, /* 90 */ 2_268_459, /* 91 */ 2_495_305, /* 92 */ 2_744_836,
+ /* 93 */ 3_019_320, /* 94 */ 3_321_252, /* 95 */ 3_653_374, /* 96 */ 4_018_711, /* 97 */ 4_420_582,
+ /* 98 */ 4_862_641, /* 99 */ 5_348_905, /* 100 */ 5_883_796, /* 101 */ 6_472_176, /* 102 */ 7_119_394,
+ /* 103 */ 7_831_333, /* 104 */ 8_614_467, /* 105 */ 9_475_909, /* 106 */ 10_423_501, /* 107 */ 11_465_851,
+ /* 108 */ 12_612_437, /* 109 */ 13_873_681, /* 110 */ 15_261_050, /* 111 */ 16_787_154,
+ /* 112 */ 18_465_870, /* 113 */ 20_312_458, /* 114 */ 22_343_706, /* 115 */ 24_578_077,
+ /* 116 */ 27_035_886, /* 117 */ 29_739_474, /* 118 */ 32_713_425, /* 119 */ 35_984_770,
+ /* 120 */ 39_583_245, /* 121 */ 43_541_573, /* 122 */ 47_895_730, /* 123 */ 52_685_306,
+ /* 124 */ 57_953_837, /* 125 */ 63_749_221, /* 126 */ 70_124_148, /* 127 */ 77_136_564,
+ /* 128 */ 84_850_228, /* 129 */ 93_335_252, /* 130 */ 102_668_779, /* 131 */ 112_935_659,
+ /* 132 */ 124_229_227, /* 133 */ 136_652_151, /* 134 */ 150_317_384, /* 135 */ 165_349_128,
+ /* 136 */ 181_884_040, /* 137 */ 200_072_456, /* 138 */ 220_079_703, /* 139 */ 242_087_671,
+ /* 140 */ 266_296_456, /* 141 */ 292_926_096, /* 142 */ 322_218_735, /* 143 */ 354_440_623,
+ /* 144 */ 389_884_688, /* 145 */ 428_873_168, /* 146 */ 471_760_495, /* 147 */ 518_936_559,
+ /* 148 */ 570_830_240, /* 149 */ 627_913_311, /* 150 */ 690_704_607, /* 151 */ 759_775_136,
+ /* 152 */ 835_752_671, /* 153 */ 919_327_967, /* 154 */ 1_011_260_767, /* 155 */ 1_112_386_880,
+ /* 156 */ 1_223_623_232, /* 157 */ 1_345_985_727, /* 158 */ 1_480_584_256, /* 159 */ 1_628_642_751,
+ /* 160 */ 1_791_507_135, /* 161 */ 1_970_657_856, /* 162 */ 2_167_723_648L, /* 163 */ 2_384_496_256L,
+ /* 164 */ 2_622_945_920L, /* 165 */ 2_885_240_448L, /* 166 */ 3_173_764_736L, /* 167 */ 3_491_141_248L,
+ /* 168 */ 3_840_255_616L, /* 169 */ 4_224_281_216L, /* 170 */ 4_646_709_504L, /* 171 */ 5_111_380_735L,
+ /* 172 */ 5_622_519_040L, /* 173 */ 6_184_770_816L, /* 174 */ 6_803_248_384L, /* 175 */ 7_483_572_991L,
+ /* 176 */ 8_231_930_623L, /* 177 */ 9_055_123_968L, /* 178 */ 9_960_636_928L, /* 179 */ 10_956_701_183L,
+ /* 180 */ 12_052_370_943L, /* 181 */ 13_257_608_703L, /* 182 */ 14_583_370_240L, /* 183 */ 16_041_708_032L,
+ /* 184 */ 17_645_878_271L, /* 185 */ 19_410_467_839L, /* 186 */ 21_351_515_136L, /* 187 */ 23_486_667_775L,
+ /* 188 */ 25_835_334_655L, /* 189 */ 28_418_870_271L, /* 190 */ 31_260_756_991L, /* 191 */ 34_386_835_455L,
+ /* 192 */ 37_825_517_567L, /* 193 */ 41_608_071_168L, /* 194 */ 45_768_882_175L, /* 195 */ 50_345_768_959L,
+ /* 196 */ 55_380_346_880L, /* 197 */ 60_918_384_640L, /* 198 */ 67_010_226_176L, /* 199 */ 73_711_251_455L,
+ /* 200 */ 81_082_380_287L, /* 201 */ 89_190_617_088L, /* 202 */ 98_109_681_663L, /* 203 */ 107_920_658_432L,
+ /* 204 */ 118_712_725_503L, /* 205 */ 130_584_006_656L, /* 206 */ 143_642_402_816L,
+ /* 207 */ 158_006_648_832L, /* 208 */ 173_807_329_279L, /* 209 */ 191_188_066_303L,
+ /* 210 */ 210_306_867_200L, /* 211 */ 231_337_566_208L, /* 212 */ 254_471_331_839L,
+ /* 213 */ 279_918_460_927L, /* 214 */ 307_910_328_319L, /* 215 */ 338_701_369_343L,
+ /* 216 */ 372_571_521_024L, /* 217 */ 409_827_917_823L, /* 218 */ 450_810_724_351L,
+ /* 219 */ 495_891_791_872L, /* 220 */ 545_481_015_295L, /* 221 */ 600_029_102_079L,
+ /* 222 */ 660_032_028_671L, /* 223 */ 726_035_300_351L, /* 224 */ 798_638_833_663L,
+ /* 225 */ 878_502_772_736L, /* 226 */ 966_353_059_839L, /* 227 */ 1_062_988_382_207L,
+ /* 228 */ 1_169_287_217_151L, /* 229 */ 1_286_216_024_063L, /* 230 */ 1_414_837_633_024L,
+ /* 231 */ 1_556_321_468_416L, /* 232 */ 1_711_953_739_776L, /* 233 */ 1_883_149_107_199L,
+ /* 234 */ 2_071_464_050_688L, /* 235 */ 2_278_610_567_167L, /* 236 */ 2_506_471_636_992L,
+ /* 237 */ 2_757_119_049_728L, /* 238 */ 3_032_831_098_880L, /* 239 */ 3_336_114_143_231L,
+ /* 240 */ 3_669_725_675_520L, /* 241 */ 4_036_698_439_680L, /* 242 */ 4_440_368_349_184L,
+ /* 243 */ 4_884_405_157_887L, /* 244 */ 5_372_846_014_464L, /* 245 */ 5_910_131_113_984L,
+ /* 246 */ 6_501_144_199_168L, /* 247 */ 7_151_258_697_727L, /* 248 */ 7_866_384_908_288L,
+ /* 249 */ 8_653_023_477_760L, /* 250 */ 9_518_326_480_895L, /* 251 */ 10_470_159_810_560L,
+ /* 252 */ 11_517_175_529_472L, /* 253 */ 12_668_893_659_136L, /* 254 */ 13_935_783_182_336L,
+ /* 255 */ /* 15329425519609L */ Long.MAX_VALUE};
+
+ /** The Pearson default hash of 0, i.e. {@code PEARSON_TABLE[0]}. */
+ public static final int T0 = 1 /* T[0] */;
+ /** The Pearson default hash of 2, i.e. {@code PEARSON_TABLE[2]}. */
+ public static final int T2 = 49 /* T[2] */;
+ /** The Pearson default hash of 3, i.e. {@code PEARSON_TABLE[3]}. */
+ public static final int T3 = 12 /* T[3] */;
+ /** The Pearson default hash of 5, i.e. {@code PEARSON_TABLE[5]}. */
+ public static final int T5 = 178 /* T[5] */;
+ /** The Pearson default hash of 7, i.e. {@code PEARSON_TABLE[7]}. */
+ public static final int T7 = 166 /* T[7] */;
+ /** The Pearson default hash of 11, i.e. {@code PEARSON_TABLE[11]}. */
+ public static final int T11 = 84 /* T[11] */;
+ /** The Pearson default hash of 13, i.e. {@code PEARSON_TABLE[13]}. */
+ public static final int T13 = 230 /* T[13] */;
+
+ /**
+ * The scaling multiplier for difference scoring.
+ */
+ public static final int DIFF_SCALE = 12;
+
+
+ /** The length threshold for step 1; equal to {@code TOPVAL[15]}. */
+ public static final int LEN_STEP_1 = 656;
+ /** The natural log of 1.5, as used in the C++ reference implementation for step 1. */
+ public static final double LOG_1_5 = 0.405_465_100D;
+
+ /** The length threshold for step 2. */
+ public static final int LEN_STEP_2 = 3199;
+ /** The adjustment for step 2. */
+ public static final double LEN_ADJ_2 = 8.727_770D;
+ /** The natural log of 1.3, as used in the C++ reference implementation for step 2. */
+ public static final double LOG_1_3 = 0.262_364_260D;
+
+ /** The adjustment for step 3. */
+ public static final double LEN_ADJ_3 = 62.547_200D;
+ /** The natural log of 1.1, as used in the C++ reference implementation for step 3. */
+ public static final double LOG_1_1 = 0.095_310_180D;
+ /** Upper-case hexadecimal digits as US-ASCII bytes; position i holds the digit for value i. */
+ public static final byte[] HEX_ARRAY = "0123456789ABCDEF".getBytes(StandardCharsets.US_ASCII);
+
+ private TLSHConstants(){ // static constants only; never instantiated
+ }
+}
diff --git a/flink-cyber/flink-stellar/src/main/java/org/apache/metron/stellar/common/utils/hashing/tlsh/TLSHHasher.java b/flink-cyber/flink-stellar/src/main/java/org/apache/metron/stellar/common/utils/hashing/tlsh/TLSHHasher.java
index f3f8e4fb3..144688567 100644
--- a/flink-cyber/flink-stellar/src/main/java/org/apache/metron/stellar/common/utils/hashing/tlsh/TLSHHasher.java
+++ b/flink-cyber/flink-stellar/src/main/java/org/apache/metron/stellar/common/utils/hashing/tlsh/TLSHHasher.java
@@ -17,10 +17,7 @@
*/
package org.apache.metron.stellar.common.utils.hashing.tlsh;
-import com.trendmicro.tlsh.BucketOption;
-import com.trendmicro.tlsh.ChecksumOption;
import org.apache.commons.codec.DecoderException;
-import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.binary.Hex;
import org.apache.metron.stellar.common.utils.ConversionUtils;
import org.apache.metron.stellar.common.utils.SerDeUtils;
@@ -28,152 +25,143 @@
import org.apache.metron.stellar.common.utils.hashing.Hasher;
import java.nio.charset.StandardCharsets;
-import java.security.NoSuchAlgorithmException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.Set;
public class TLSHHasher implements Hasher {
- public static final String TLSH_KEY = "tlsh";
- public static final String TLSH_BIN_KEY = "tlsh_bin";
-
- public enum Config implements EnumConfigurable {
- BUCKET_SIZE("bucketSize"),
- CHECKSUM("checksumBytes"),
- HASHES("hashes"),
- FORCE("force")
- ;
- final public String key;
- Config(String key) {
- this.key = key;
+ public static final String TLSH_KEY = "tlsh";
+ public static final String TLSH_BIN_KEY = "tlsh_bin";
+
+ public enum Config implements EnumConfigurable { // enum of configuration keys; each constant wraps one key string
+ BUCKET_SIZE("bucketSize"),
+ CHECKSUM("checksumBytes"),
+ HASHES("hashes"),
+ FORCE("force");
+ final public String key; // the string returned by getKey()
+ // Binds the enum constant to its key string.
+ Config(String key) {
+ this.key = key;
+ }
+ /** Returns the key string this constant was constructed with. */
@Override
public String getKey() {
return key;
}
+ }
+ Integer bucketOption = 128; // default; passed to TLSHBuilder.BUCKET_OPTION.fromVal in getHash
+ Integer checksumOption = 1; // default; passed to TLSHBuilder.CHECKSUM_OPTION.fromVal, and bin() skips 2 * this many hex chars
+ Boolean force = true; // NOTE(review): not read by getHash or bin as shown here; presumably set via configure() - verify
+ List hashes = new ArrayList<>(); // projection sizes for bin(); while empty, getHash returns the plain hash string
+
+ /**
+ * Computes the TLSH hash of the input: Strings are hashed as UTF-8 bytes, byte arrays as-is, and
+ * anything else (including a null input) via {@code SerDeUtils.toBytes}.
+ *
+ * @param o The value to hash.
+ * @return The hash string, or (when {@code hashes} is non-empty) a map of {@code "tlsh"} to the
+ * hash plus the {@link #bin(String)} entries; {@code null} if computing the hash throws.
+ */
@Override
- public String getKey() {
- return key;
+ public Object getHash(Object o) {
+ TLSHBuilder builder = new TLSHBuilder(TLSHBuilder.CHECKSUM_OPTION.fromVal(checksumOption), TLSHBuilder.BUCKET_OPTION.fromVal(bucketOption));
+ byte[] data;
+ if (o instanceof String) {
+ data = ((String) o).getBytes(StandardCharsets.UTF_8);
+ } else if (o instanceof byte[]) {
+ data = (byte[]) o;
+ } else {
+ data = SerDeUtils.toBytes(o); // also reached when o is null; runs outside the try below
+ }
+ try {
+ TLSH tlsh = builder.getTLSH(data);
+ builder.clean();
+ String hash = tlsh.getHash();
+ if (hashes != null && !hashes.isEmpty()) {
+ Map ret = new HashMap<>();
+ ret.put(TLSH_KEY, hash);
+ ret.putAll(bin(hash));
+ return ret;
+ } else {
+ return hash;
+ }
+ } catch (Exception e) { // any failure while hashing or binning is swallowed; the caller only sees null
+ return null;
+ }
+ }
- }
-
- BucketOption bucketOption = BucketOption.BUCKETS_128;
- ChecksumOption checksumOption = ChecksumOption.CHECKSUM_1B;
- Boolean force = true;
- List hashes = new ArrayList<>();
- /**
- * Returns an encoded string representation of the hash value of the input. It is expected that
- * this implementation does throw exceptions when the input is null.
- *
- * @param o The value to hash.
- * @return A hash of {@code toHash} that has been encoded.
- * @throws EncoderException If unable to encode the hash then this exception occurs.
- * @throws NoSuchAlgorithmException If the supplied algorithm is not known.
- */
- @Override
- public Object getHash(Object o) throws EncoderException, NoSuchAlgorithmException {
- TLSH tlsh = TLSHCache.INSTANCE.get().getTLSH(bucketOption, checksumOption);
- byte[] data = null;
- if (o instanceof String) {
- data = ((String)o).getBytes(StandardCharsets.UTF_8);
- } else if (o instanceof byte[]) {
- data = (byte[])o;
- } else {
- data = SerDeUtils.toBytes(o);
- }
- try {
- String hash = tlsh.apply(data, force);
- if (hashes != null && hashes.size() > 0) {
- Map ret = new HashMap<>();
- ret.put(TLSH_KEY, hash);
- ret.putAll(bin(hash));
+ public Map bin(String hash) throws DecoderException { // samples bits of the hash at pseudo-random positions; one output entry per size in hashes
+ Random r = new Random(0); // constant seed: every call draws the same sequence of bit positions
+ byte[] h = Hex.decodeHex(hash.substring(2 * checksumOption).toCharArray()); // skip the first 2 * checksumOption hex characters, decode the rest
+ BitSet vector = BitSet.valueOf(h);
+ int n = vector.length(); // BitSet.length() is the index of the highest set bit plus one, not the raw bit count
+ Map ret = new HashMap<>();
+ boolean singleHash = hashes.size() == 1;
+ for (int numHashes : hashes) {
+ BitSet projection = new BitSet();
+ for (int i = 0; i < numHashes; ++i) {
+ int index = r.nextInt(n); // Random.nextInt throws IllegalArgumentException when n == 0 (no bit set in vector)
+ projection.set(i, vector.get(index));
+ }
+ String outputHash = numHashes + Hex.encodeHexString(projection.toByteArray()); // the size, then the hex of the sampled bits
+ if (singleHash) {
+ ret.put(TLSH_BIN_KEY, outputHash); // a single configured size uses the bare key
+ } else {
+ ret.put(TLSH_BIN_KEY + "_" + numHashes, outputHash); // otherwise the key is suffixed with the size
+ }
+ }
return ret;
- } else {
- return hash;
- }
- } catch (Exception e) {
- return null;
}
- }
- public Map bin(String hash) throws DecoderException {
- Random r = new Random(0);
- byte[] h = Hex.decodeHex(hash.substring(2 * checksumOption.getChecksumLength()).toCharArray());
- BitSet vector = BitSet.valueOf(h);
- int n = vector.length();
- Map ret = new HashMap<>();
- boolean singleHash = hashes.size() == 1;
- for (int numHashes : hashes) {
- BitSet projection = new BitSet();
- for (int i = 0; i < numHashes; ++i) {
- int index = r.nextInt(n);
- projection.set(i, vector.get(index));
- }
- String outputHash = numHashes + Hex.encodeHexString(projection.toByteArray());
- if (singleHash) {
- ret.put(TLSH_BIN_KEY, outputHash);
- } else {
- ret.put(TLSH_BIN_KEY + "_" + numHashes, outputHash);
- }
+ @Override
+ public void configure(Optional