Skip to content

Commit

Permalink
Release 0.1.0 (#7)
Browse files Browse the repository at this point in the history
* Add new REST endpoint to query a list of conflicts for a given file

* Add highlighting of conflicts in the UI

* Fix dockercompose build

* Conflict highlighting works

* Add conflict rate

* Add conflict rate

* Add conflict rate to UI

* Add conflict rate to UI

* Prepare the prod build

* Prepare the prod build

* added jupyter notebook

* added jupyter notebook

added py

* Adding conf res

* Add char result equals()

* Adding conf res

* Adding conf res

* Adding conf res

* Add char result equals()

* refactoring the conf res

* Adding conf res

* Adding conf res

* adding conf res rest api

* added resolve button

* Adding conf res

* improving conf res

* Restoring the default algorithm of conf res

* Adding session storage hook

* UI works

* Minor fixes

* conf res works

* Improve size statistics

* Fixing the dashboard

* Adding a dynamic list of property val distribs for dashboard.

* Fixed a bug with prop val distrib for floats with conflicts

* Dashboard works

* Fixed date type visualization in dashboard

* Fix empty input test in Filters

* Fixed a bug with prop val distrib for timestamps with conflicts

* Minor visual improvements

* Highlighting conflicts

* Conflict rate formula update

* UX improvement. Fetch props list.

* Add a list of unique properties

* Improve README.md

---------


Co-authored-by: artur <[email protected]>
  • Loading branch information
artourkin and artur authored Feb 4, 2024
1 parent 06adf43 commit 0311095
Show file tree
Hide file tree
Showing 58 changed files with 70,243 additions and 388 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ hs_err_pid*
replay_pid*


**/package-lock.json
**/build/
**/yarn.lock
*.lock
Expand Down
50 changes: 47 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,53 @@
# fitsinn
# FITSInn
Place where your FITS files feel good.
## Purpose

Place where your FITS files feel good
FITSInn is a tool to store and analyse technical metadata extracted by characterisation tools such as [FITS](https://projects.iq.harvard.edu/fits/).
![img.png](docs/img.png)

Content Profiling revisited
The features include:
* Improved user experience through an intuitive UI.
* Running file characterisation using FITS on the uploaded files.
* The original files are not stored.
* Slice and dice:
  * Filtering,
  * Drill-down,
  * Property value aggregations,
  * Distribution visualisations,
  * Sampling.
* Built-in metadata conflict resolution.
* REST API for workflow automation.


## Installation

### Deployment

The artifacts are released as Docker images. You can install FITSInn using Docker-compose:

```
docker-compose -f docker-compose.yaml up --pull always
```

Installing FITSInn on Docker Swarm or Kubernetes (K8S) is possible, but is not currently documented.


### Local build

Building the Docker images from scratch and starting FITSInn is executed via:

```
docker-compose -f docker-compose.dev.yaml up --build
```

File uploading using bash:

bash fileupload.sh http://localhost:8082 ~/rnd/data/govdocs_fits/govdocs1/000/

## Issues

If you encounter any issues with FITSInn, please report them at [https://github.com/datascience/fitsinn/issues](https://github.com/datascience/fitsinn/issues).

## License

FITSInn is released under the MIT license. See [LICENSE](LICENSE) for details.
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
package rocks.artur.api;

import rocks.artur.domain.FilterCriteria;

import java.util.Map;

public interface GetCollectionStatistics {
Map<String, Object> getSizeStatistics();
Map<String, Double> getStatistics(FilterCriteria filterCriteria);

Double getConflictRate();
}
2 changes: 2 additions & 0 deletions core/src/main/java/rocks/artur/api/GetObjects.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,6 @@
/**
 * Query API for digital objects and the characterisation results recorded for them.
 */
public interface GetObjects {
/**
 * Lists per-object property statistics for the objects selected by the given filter.
 *
 * @param filterCriteria the filter to apply
 * @return one statistic per selected object
 */
List<PropertiesPerObjectStatistic> getObjects(FilterCriteria filterCriteria);
/**
 * Returns the characterisation results recorded for a single file.
 *
 * @param filePath path identifying the file
 * @return the characterisation results for that file
 */
Iterable<CharacterisationResult> getObject(String filePath);

/**
 * Returns the characterisation results of a single file that are in conflict.
 *
 * @param filePath path identifying the file
 * @return the conflicting characterisation results for that file
 */
List<CharacterisationResult> getConflictsFromObject(String filePath);
}
5 changes: 5 additions & 0 deletions core/src/main/java/rocks/artur/api/ResolveConflicts.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package rocks.artur.api;

/**
 * Use case that resolves conflicts between stored characterisation results.
 */
public interface ResolveConflicts {
/**
 * Executes the conflict resolution. Takes no input and returns nothing; the effect is
 * whatever the implementation applies to the stored characterisation results.
 */
void run();
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import rocks.artur.api.GetCollectionStatistics;
import rocks.artur.domain.CharacterisationResultGateway;
import rocks.artur.domain.FilterCriteria;

import java.util.Map;

Expand All @@ -14,13 +15,9 @@ public GetCollectionStatisticsImpl(CharacterisationResultGateway characterisatio
}

@Override
public Map<String, Object> getSizeStatistics() {
Map<String, Object> sizeStatistics = characterisationResultGateway.getSizeStatistics();
public Map<String, Double> getStatistics(FilterCriteria filterCriteria) {
Map<String, Double> sizeStatistics = characterisationResultGateway.getCollectionStatistics(filterCriteria);
return sizeStatistics;
}

@Override
public Double getConflictRate() {
return 17.0;
}
}
8 changes: 8 additions & 0 deletions core/src/main/java/rocks/artur/api_impl/GetObjectsImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,12 @@ public Iterable<CharacterisationResult> getObject(String filePath) {
Iterable<CharacterisationResult> characterisationResultsByFilepath = characterisationResultGateway.getCharacterisationResultsByFilepath(filePath);
return characterisationResultsByFilepath;
}

/**
 * Looks up the conflicting characterisation results of a single file.
 *
 * @param filePath path identifying the file
 * @return whatever the gateway reports as conflicts for that file path
 */
@Override
public List<CharacterisationResult> getConflictsFromObject(String filePath) {
    // Pure delegation: the gateway owns the definition of what counts as a conflict.
    return characterisationResultGateway.getConflictsByFilepath(filePath);
}


}
153 changes: 153 additions & 0 deletions core/src/main/java/rocks/artur/api_impl/ResolveConflictsImpl.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
package rocks.artur.api_impl;

import rocks.artur.api.ResolveConflicts;
import rocks.artur.domain.CharacterisationResult;
import rocks.artur.domain.CharacterisationResultGateway;
import rocks.artur.domain.Entry;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * Resolves conflicting characterisation results by iterative weighted voting.
 *
 * <p>Every source starts with the same weight. For each entry returned by the gateway, the value
 * with the largest summed source weight becomes the current "truth". Source weights are then
 * recomputed from how often each source disagrees with that truth, and the vote is repeated.
 * Finally, every stored result whose value differs from the truth of its entry is deleted.
 *
 * <p>The working state ({@code sources}, {@code sourceWeights}, {@code truth}) lives in instance
 * fields that {@link #run()} re-initialises on every call.
 */
public class ResolveConflictsImpl implements ResolveConflicts {

    /** Number of "update weights, then re-vote" rounds performed after the initial vote. */
    private static final int REFINEMENT_ROUNDS = 3;

    /**
     * Lower bound applied to a source's relative error rate before taking the logarithm, so that
     * a source with no observed errors receives a large but finite weight instead of infinity.
     */
    private static final double MIN_RELATIVE_ERROR = 0.00001;

    private final CharacterisationResultGateway characterisationResultGateway;

    /** Names of all sources as reported by the gateway; set by {@link #init()}. */
    List<String> sources;

    /** Current voting weight per source; set by {@link #init()}, refined by each round. */
    Map<String, Double> sourceWeights;

    /** Current winning value per entry; filled by the voting step. */
    Map<Entry, String> truth;

    public ResolveConflictsImpl(CharacterisationResultGateway characterisationResultGateway) {
        this.characterisationResultGateway = characterisationResultGateway;
    }

    /**
     * Runs the complete resolution: initialise weights, vote, refine weights and re-vote
     * {@link #REFINEMENT_ROUNDS} times, then delete every result that lost the vote.
     * Progress (weights and their sum) is printed to standard output.
     */
    @Override
    public void run() {
        init();
        System.out.println(sourceWeights);
        updateTruth();
        System.out.println("sum of weights: " + sumOfWeights());
        for (int i = 0; i < REFINEMENT_ROUNDS; i++) {
            updateWeights();
            System.out.println(sourceWeights);
            System.out.println("sum of weights: " + sumOfWeights());
            updateTruth();
        }

        resolveConflicts();
    }

    /** Adds up the current source weights (used for progress output only). */
    private Double sumOfWeights() {
        return sourceWeights.values().stream().reduce(0d, Double::sum);
    }

    /** Deletes every stored result whose value differs from the truth chosen for its entry. */
    private void resolveConflicts() {
        for (Map.Entry<Entry, String> resolved : truth.entrySet()) {
            String trueValue = resolved.getValue();
            List<CharacterisationResult> candidates =
                    characterisationResultGateway.getCharacterisationResultsByEntry(resolved.getKey());
            for (CharacterisationResult candidate : candidates) {
                // Null-safe comparison: a result without a value must not abort the clean-up.
                if (!Objects.equals(candidate.getValue(), trueValue)) {
                    characterisationResultGateway.delete(candidate);
                }
            }
        }
    }

    /**
     * Recomputes the source weights from the current truth.
     *
     * <p>A source's error rate is the fraction of its results that differ from the truth. Its new
     * weight is {@code -log(errorRate / maxErrorRate)}, so the source that disagrees most often
     * gets weight 0 and sources that disagree less often get larger weights. The ratio is clamped
     * to {@link #MIN_RELATIVE_ERROR} so that an error-free source gets the largest finite weight.
     *
     * <p>If no source disagrees with the truth at all, the error rates carry no information and
     * the previous weights are kept (the ratio would otherwise be 0/0).
     */
    private void updateWeights() {
        Map<String, Double> errors = new HashMap<>();
        Map<String, Double> observations = new HashMap<>();
        for (String source : sources) {
            errors.put(source, 0.0);
            observations.put(source, 0.0);
        }

        for (Entry entry : characterisationResultGateway.getEntries()) {
            String trueValue = truth.get(entry);
            List<CharacterisationResult> results =
                    characterisationResultGateway.getCharacterisationResultsByEntry(entry);
            for (CharacterisationResult result : results) {
                String source = result.getSource();
                observations.merge(source, 1.0, Double::sum);
                if (Objects.equals(result.getValue(), trueValue)) {
                    // Make sure sources unknown to getSources() still receive a weight.
                    errors.putIfAbsent(source, 0.0);
                } else {
                    errors.merge(source, 1.0, Double::sum);
                }
            }
        }

        Map<String, Double> errorRates = new HashMap<>();
        double maxErrorRate = 0.0;
        for (Map.Entry<String, Double> sourceErrors : errors.entrySet()) {
            double observed = observations.getOrDefault(sourceErrors.getKey(), 0.0);
            double errorRate = observed == 0.0 ? 0.0 : sourceErrors.getValue() / observed;
            errorRates.put(sourceErrors.getKey(), errorRate);
            maxErrorRate = Math.max(maxErrorRate, errorRate);
        }

        if (maxErrorRate == 0.0) {
            // Every result agrees with the truth: nothing to learn, keep the current weights.
            return;
        }

        for (Map.Entry<String, Double> sourceRate : errorRates.entrySet()) {
            double relativeError = sourceRate.getValue() / maxErrorRate;
            sourceWeights.put(
                    sourceRate.getKey(),
                    -Math.log(Math.max(relativeError, MIN_RELATIVE_ERROR)));
        }
    }

    /**
     * Re-votes the truth of every entry: each result adds its source's weight to the score of the
     * value it reports, and the value with the highest score wins. Entries without any result keep
     * whatever truth they had before (initially none).
     */
    private void updateTruth() {
        for (Entry entry : characterisationResultGateway.getEntries()) {
            List<CharacterisationResult> results =
                    characterisationResultGateway.getCharacterisationResultsByEntry(entry);

            Map<String, Double> votingScores = new HashMap<>();
            for (CharacterisationResult result : results) {
                // A source without a weight yet contributes nothing instead of failing on null.
                double sourceWeight = sourceWeights.getOrDefault(result.getSource(), 0.0);
                votingScores.merge(result.getValue(), sourceWeight, Double::sum);
            }

            votingScores.entrySet().stream()
                    .max(Map.Entry.comparingByValue())
                    .ifPresent(winner -> truth.put(entry, winner.getKey()));
        }
    }

    /**
     * Resets the working state: loads the sources from the gateway, gives each of them the same
     * weight {@code 1 / numberOfSources} and clears the truth.
     */
    void init() {
        sources = characterisationResultGateway.getSources();
        sourceWeights = sources.stream().collect(Collectors.toMap(
                Function.identity(),
                s -> 1.0 / sources.size()));
        truth = new HashMap<>();
    }
}
30 changes: 30 additions & 0 deletions core/src/main/java/rocks/artur/domain/CharacterisationResult.java
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,34 @@ public String getFilePath() {
/**
 * Sets the path of the file this characterisation result belongs to.
 *
 * @param filePath the file path to store
 */
public void setFilePath(String filePath) {
this.filePath = filePath;
}

/**
 * Two characterisation results are equal when they are of the same class and agree on
 * property, value type, value, source and file path.
 *
 * <p>{@code property} and {@code valueType} are compared by reference; {@code value},
 * {@code source} and {@code filePath} are compared with {@code equals}, where two nulls count
 * as equal and a null never equals a non-null.
 *
 * @param o the object to compare with
 * @return {@code true} if all compared fields match
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (o == null || getClass() != o.getClass()) {
        return false;
    }

    CharacterisationResult other = (CharacterisationResult) o;

    return property == other.property
            && (value == null ? other.value == null : value.equals(other.value))
            && valueType == other.valueType
            && (source == null ? other.source == null : source.equals(other.source))
            && (filePath == null ? other.filePath == null : filePath.equals(other.filePath));
}

/**
 * Computes a hash code from the same fields that {@code equals} compares: property, value,
 * value type, source and file path.
 *
 * <p>Every field is handled null-safely (a null field contributes 0), so any object that
 * {@code equals} accepts can also be hashed. For objects with all fields set, the result is the
 * same as before this null handling was added.
 *
 * @return the hash code of this characterisation result
 */
@Override
public int hashCode() {
    int result = property != null ? property.hashCode() : 0;
    result = 31 * result + (value != null ? value.hashCode() : 0);
    result = 31 * result + (valueType != null ? valueType.hashCode() : 0);
    result = 31 * result + (source != null ? source.hashCode() : 0);
    result = 31 * result + (filePath != null ? filePath.hashCode() : 0);
    return result;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,20 @@ public interface CharacterisationResultGateway {
*/
List<CharacterisationResult> getCharacterisationResultsByFilepath(String filePath);

Map<String, Object> getSizeStatistics();
List<CharacterisationResult> getCharacterisationResultsByEntry(Entry entry);

List<Entry> getConflictEntries();

List<Entry> getEntries();

/**
 * Gets the characterisation results with conflicts for a given digital object.
 *
 * @param filepath the file path identifying the digital object
 * @return a list of the conflicting characterisation results.
 */
List<CharacterisationResult> getConflictsByFilepath(String filepath);

Map<String, Double> getCollectionStatistics(FilterCriteria filterCriteria);

List<PropertyValueStatistic> getPropertyValueDistribution(Property property, FilterCriteria<CharacterisationResult> filter);

Expand All @@ -68,4 +81,8 @@ public interface CharacterisationResultGateway {
List<String[]> getSamples(FilterCriteria filterCriteria, SamplingAlgorithms algorithm, List<Property> properties);

void addCharacterisationResults(List<CharacterisationResult> characterisationResults);

double getConflictRate();

void delete(CharacterisationResult characterisationResult);
}
Loading

0 comments on commit 0311095

Please sign in to comment.