Skip to content

Commit

Permalink
Merge pull request #12 from VIDA-NYU/dev-0.30.1
Browse files Browse the repository at this point in the history
Dev 0.30.1
  • Loading branch information
heikomuller authored Dec 29, 2020
2 parents 42805c3 + d2a29d4 commit 8649715
Show file tree
Hide file tree
Showing 16 changed files with 457 additions and 12 deletions.
34 changes: 34 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ Usage:
local-domains
strong-domains
Alternatives
------------
no-expand
columns-as-domains
Explore Results
---------------
export
Expand Down Expand Up @@ -204,6 +209,35 @@ strong-domains
```


#### Alternatives

**No expansion:** Use the `no-expand` option to discover local domains on the original dataset columns (without expansion). This option outputs a columns file in the same format as the `expand-columns` step, which can then be used as input for the `local-domains` step.

```
$> java -jar /home/user/lib/D4.jar no-expand --help
D4 - Data-Driven Domain Discovery - Version (0.30.1)
no-expand
--eqs=<file> [default: 'compressed-term-index.txt.gz']
--verbose=<boolean> [default: true]
--columns=<file> [default: 'expanded-columns.txt.gz']
```


**Whole column as domain:** Instead of discovering local domains within (expanded) columns, use the `columns-as-domains` option to treat each unique (expanded) column as a local domain.

```
$> java -jar /home/user/lib/D4.jar columns-as-domains --help
D4 - Data-Driven Domain Discovery - Version (0.30.1)
columns-as-domains
--eqs=<file> [default: 'compressed-term-index.txt.gz']
--columns=<file> [default: 'expanded-columns.txt.gz']
--verbose=<boolean> [default: true]
--localdomains=<file> [default: 'local-domains.txt.gz']
```


### Explore Results

**Export Domains:** The discovered strong domains can be exported as JSON files for exploration. For each domain a separate file will be created in the output directory. The `--sampleSize` parameter controls the maximum number of terms that are included in the result for each equivalence class.
Expand Down
5 changes: 3 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,12 @@
<archive>
<manifest>
<addClasspath>true</addClasspath>
<!--mainClass>org.opendata.curation.d4.D4</mainClass-->
<mainClass>org.opendata.curation.d4.D4</mainClass>
<!--mainClass>org.opendata.curation.d4.D4Interactive</mainClass-->
<!--mainClass>org.opendata.curation.d4.signature.RobustSignatureGenerator</mainClass-->
<!--mainClass>org.opendata.curation.d4.domain.DomainSetStatsPrinter</mainClass-->
<!--mainClass>org.opendata.curation.d4.evaluate.BestGTLocalMatchPrinter</mainClass-->
<mainClass>org.opendata.curation.d4.evaluate.BestGTLocalMatchWriter</mainClass>
<!--mainClass>org.opendata.curation.d4.evaluate.BestGTLocalMatchWriter</mainClass-->
<!--mainClass>org.opendata.curation.d4.evaluate.PrepareGTFile</mainClass-->
<!--mainClass>org.opendata.curation.d4.export.ExportStrongDomains</mainClass-->
<!--mainClass>org.opendata.db.eq.SimilarTermIndexGenerator</mainClass-->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
*/
package org.opendata.core.graph;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
Expand Down Expand Up @@ -76,6 +77,7 @@ public synchronized boolean contains(int nodeId) {
}
}

@Override
public synchronized void edge(int sourceId, int targetId) {

int sourceCompId = _componentMap[sourceId];
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/opendata/core/set/IDSet.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
*/
package org.opendata.core.set;

import com.google.gson.JsonArray;
import java.math.BigDecimal;
import java.util.List;
import org.opendata.core.object.ObjectFilter;
Expand Down Expand Up @@ -44,4 +45,5 @@ public interface IDSet extends ObjectSet<Integer>, ObjectFilter<Integer> {
public List<Integer> toSortedList();
public IDSet union(IDSet list);
public IDSet union(int id);
public JsonArray toJsonArray();
}
14 changes: 14 additions & 0 deletions src/main/java/org/opendata/core/set/IDSetImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
*/
package org.opendata.core.set;

import com.google.gson.JsonArray;
import com.google.gson.JsonPrimitive;
import java.math.BigDecimal;
import java.util.Collection;
import java.util.Collections;
Expand Down Expand Up @@ -212,6 +214,18 @@ public List<Integer> toSortedList() {
Collections.sort(result);
return result;
}

/**
 * Converts this set into a JSON array that holds one numeric element for
 * each identifier returned by {@code toArray()}, in iteration order.
 *
 * @return JSON array of the identifiers in this set
 */
@Override
public JsonArray toJsonArray() {

    final JsonArray elements = new JsonArray();
    for (int id : this.toArray()) {
        final JsonPrimitive value = new JsonPrimitive(id);
        elements.add(value);
    }
    return elements;
}

@Override
public String toString() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
*/
package org.opendata.core.set;

import com.google.gson.JsonArray;
import java.math.BigDecimal;
import java.util.Iterator;
import java.util.List;
Expand Down Expand Up @@ -181,6 +182,12 @@ public String toIntString() {
return _values.toIntString();
}

/**
 * Returns the JSON array representation of this set. The call is
 * delegated unchanged to the wrapped {@code _values} set.
 *
 * @return the JSON array produced by {@code _values.toJsonArray()}
 */
@Override
public JsonArray toJsonArray() {

return _values.toJsonArray();
}

@Override
public List<Integer> toList() {

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/opendata/curation/d4/Constants.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,5 @@ public final class Constants {

public static final String NAME = "D4 - Data-Driven Domain Discovery";

public static final String VERSION = "0.30.0";
public static final String VERSION = "0.30.1";
}
149 changes: 142 additions & 7 deletions src/main/java/org/opendata/curation/d4/D4.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,43 @@ public void columns(
.run(files, threads);
}
}

/**
 * Runs the columns-as-domains step. The columns file is streamed into an
 * {@code ExpandedColumnIndex} which is then handed to
 * {@code InMemLocalDomainGenerator.columnsAsDomains} together with a
 * {@code DomainWriter} for the output file.
 *
 * @param eqFile compressed term index file. This step does not read it;
 *        the parameter is retained so that existing callers keep working.
 * @param columnsFile input file containing the (expanded) columns
 * @param verbose if true, print the step name and file parameters before
 *        running, and print statistics for the written domains afterwards
 * @param outputFile file that the local domains are written to
 * @throws java.io.IOException if reading the input or writing the output
 *         fails
 */
public void columnsAsDomains(
        File eqFile,
        File columnsFile,
        boolean verbose,
        File outputFile
) throws java.io.IOException {

    if (verbose) {
        System.out.println(
                String.format(
                        "%s\n" +
                        " --columns=%s\n" +
                        " --localdomains=%s",
                        STEP_COLUMN_DOMAINS,
                        columnsFile.getAbsolutePath(),
                        outputFile.getAbsolutePath()
                )
        );
    }

    // NOTE(review): an earlier version built a DataManager over the
    // compressed term index here and never used it. Nothing below depends
    // on the term index, so the unused construction was dropped to avoid
    // opening the eqs file for no reason.
    ExpandedColumnIndex columnIndex = new ExpandedColumnIndex();
    new ExpandedColumnReader(columnsFile).stream(columnIndex);

    new InMemLocalDomainGenerator().columnsAsDomains(
            columnIndex,
            new DomainWriter(outputFile)
    );

    if (verbose) {
        // Re-read the file that was just written to report statistics.
        DomainSetStatsPrinter localStats = new DomainSetStatsPrinter();
        new DomainReader(outputFile).stream(localStats);
        localStats.print();
    }
}

public void eqs(
File inputFile,
Expand Down Expand Up @@ -462,15 +499,47 @@ public void termIndex(
outputFile
);
}

/**
 * Runs the no-expand step. The columns obtained from a
 * {@code DataManager} over the compressed term index are passed to
 * {@code ParallelColumnExpander.noExpand}, which receives the output file
 * as its target.
 *
 * @param eqFile compressed term index file
 * @param verbose if true, print the step name and file parameters before
 *        running, and print statistics for the written columns afterwards
 * @param outputFile file that the columns are written to
 * @throws java.io.IOException if reading the input or writing the output
 *         fails
 */
public void writeColumns(
        File eqFile,
        boolean verbose,
        File outputFile
) throws java.io.IOException {

    if (verbose) {
        final String banner = String.format(
                "%s\n" +
                " --eqs=%s\n" +
                " --columns=%s",
                STEP_NO_EXPAND,
                eqFile.getAbsolutePath(),
                outputFile.getAbsolutePath()
        );
        System.out.println(banner);
    }

    final CompressedTermIndexFile termIndex = new CompressedTermIndexFile(eqFile);
    final DataManager db = new DataManager(termIndex);

    final ParallelColumnExpander expander = new ParallelColumnExpander();
    expander.noExpand(db.getColumns(), outputFile);

    if (!verbose) {
        return;
    }

    // Re-read the file that was just written to report statistics.
    final ExpandedColumnStatsWriter colStats = new ExpandedColumnStatsWriter();
    final ExpandedColumnReader writtenColumns = new ExpandedColumnReader(outputFile);
    writtenColumns.stream(colStats);
    colStats.print();
}

/**
* Identifiers for the different steps in the D4 pipeline.
*/
private static final String STEP_COLUMN_DOMAINS = "columns-as-domains";
private static final String STEP_COMPRESS_TERMINDEX = "eqs";
private static final String STEP_EXPAND_COLUMNS = "expand-columns";
private static final String STEP_EXPORT_DOMAINS = "export";
private static final String STEP_GENERATE_COLUMNS = "columns";
private static final String STEP_LOCAL_DOMAINS = "local-domains";
private static final String STEP_NO_EXPAND = "no-expand";
private static final String STEP_SIGNATURES = "signatures";
private static final String STEP_STRONG_DOMAINS = "strong-domains";
private static final String STEP_TERMINDEX = "term-index";
Expand All @@ -489,6 +558,10 @@ public void termIndex(
" " + STEP_EXPAND_COLUMNS + "\n" +
" " + STEP_LOCAL_DOMAINS + "\n" +
" " + STEP_STRONG_DOMAINS + "\n\n" +
" Alternatives\n" +
" ------------\n" +
" " + STEP_NO_EXPAND + "\n" +
" " + STEP_COLUMN_DOMAINS + "\n\n" +
" Explore Results\n" +
" ---------------\n" +
" " + STEP_EXPORT_DOMAINS + "\n\n" +
Expand Down Expand Up @@ -544,7 +617,7 @@ public static void main(String[] args) {
outputDir
);
} catch (java.lang.InterruptedException | java.io.IOException ex) {
LOGGER.log(Level.SEVERE, "COLUMN FILES", ex);
LOGGER.log(Level.SEVERE, STEP_GENERATE_COLUMNS, ex);
System.exit(-1);
}
} else if (command.equals(STEP_TERMINDEX)) {
Expand Down Expand Up @@ -584,7 +657,7 @@ public static void main(String[] args) {
outputFile
);
} catch (java.lang.InterruptedException | java.io.IOException ex) {
LOGGER.log(Level.SEVERE, "TERM INDEX", ex);
LOGGER.log(Level.SEVERE, STEP_TERMINDEX, ex);
System.exit(-1);
}
} else if (command.equals(STEP_COMPRESS_TERMINDEX)) {
Expand Down Expand Up @@ -612,7 +685,7 @@ public static void main(String[] args) {
outputFile
);
} catch (java.io.IOException ex) {
LOGGER.log(Level.SEVERE, "EQUIVALENCE CLASSES", ex);
LOGGER.log(Level.SEVERE, STEP_COMPRESS_TERMINDEX, ex);
System.exit(-1);
}
} else if (command.equals(STEP_SIGNATURES)) {
Expand Down Expand Up @@ -659,7 +732,7 @@ public static void main(String[] args) {
signatureFile
);
} catch (java.lang.InterruptedException | java.io.IOException ex) {
LOGGER.log(Level.SEVERE, "SIGNATURES", ex);
LOGGER.log(Level.SEVERE, STEP_SIGNATURES, ex);
System.exit(-1);
}
} else if (command.equals(STEP_EXPAND_COLUMNS)) {
Expand Down Expand Up @@ -710,7 +783,7 @@ public static void main(String[] args) {
columnsFile
);
} catch (java.io.IOException ex) {
LOGGER.log(Level.SEVERE, "EXPAND COLUMNS", ex);
LOGGER.log(Level.SEVERE, STEP_EXPAND_COLUMNS, ex);
System.exit(-1);
}
} else if (command.equals(STEP_LOCAL_DOMAINS)) {
Expand Down Expand Up @@ -763,7 +836,7 @@ public static void main(String[] args) {
localDomainFile
);
} catch (java.io.IOException ex) {
LOGGER.log(Level.SEVERE, "LOCAL DOMAINS", ex);
LOGGER.log(Level.SEVERE, STEP_LOCAL_DOMAINS, ex);
System.exit(-1);
}
} else if (command.equals(STEP_STRONG_DOMAINS)) {
Expand Down Expand Up @@ -814,7 +887,69 @@ public static void main(String[] args) {
strongDomainFile
);
} catch (java.lang.InterruptedException | java.io.IOException ex) {
LOGGER.log(Level.SEVERE, "STRONG DOMAINS", ex);
LOGGER.log(Level.SEVERE, STEP_STRONG_DOMAINS, ex);
System.exit(-1);
}
} else if (command.equals(STEP_NO_EXPAND)) {
// ----------------------------------------------------------------
// NO EXPAND
// ----------------------------------------------------------------
CLP params = new CLP(
new Parameter[] {
new Parameter(
"eqs",
"<file> [default: 'compressed-term-index.txt.gz']"
),
new Parameter("verbose", "<boolean> [default: true]"),
new Parameter("columns", "<file> [default: 'expanded-columns.txt.gz']")
},
args
);
File eqFile = params.getAsFile("eqs", "compressed-term-index.txt.gz");
boolean verbose = params.getAsBool("verbose", true);
File columnsFile = params.getAsFile("columns", "expanded-columns.txt.gz");
try {
new D4().writeColumns(
eqFile,
verbose,
columnsFile
);
} catch (java.io.IOException ex) {
LOGGER.log(Level.SEVERE, STEP_NO_EXPAND, ex);
System.exit(-1);
}
} else if (command.equals(STEP_COLUMN_DOMAINS)) {
// ----------------------------------------------------------------
// COLUMN DOMAINS
// ----------------------------------------------------------------
CLP params = new CLP(
new Parameter[] {
new Parameter(
"eqs",
"<file> [default: 'compressed-term-index.txt.gz']"
),
new Parameter("columns", "<file> [default: 'expanded-columns.txt.gz']"),
new Parameter("verbose", "<boolean> [default: true]"),
new Parameter(
"localdomains",
"<file> [default: 'local-domains.txt.gz']"
)
},
args
);
File eqFile = params.getAsFile("eqs", "compressed-term-index.txt.gz");
File columnsFile = params.getAsFile("columns", "expanded-columns.txt.gz");
boolean verbose = params.getAsBool("verbose", true);
File localDomainFile = params.getAsFile("localdomains", "local-domains.txt.gz");
try {
new D4().columnsAsDomains(
eqFile,
columnsFile,
verbose,
localDomainFile
);
} catch (java.io.IOException ex) {
LOGGER.log(Level.SEVERE, STEP_COLUMN_DOMAINS, ex);
System.exit(-1);
}
} else if (command.equals(STEP_EXPORT_DOMAINS)) {
Expand Down
Loading

0 comments on commit 8649715

Please sign in to comment.