diff --git a/src/main/java/nl/peterbloem/motive/MotifModel.java b/src/main/java/nl/peterbloem/motive/MotifModel.java index 65bd7bd..61766ec 100644 --- a/src/main/java/nl/peterbloem/motive/MotifModel.java +++ b/src/main/java/nl/peterbloem/motive/MotifModel.java @@ -8,6 +8,7 @@ import static nl.peterbloem.kit.Functions.logFactorial; import static nl.peterbloem.kit.Functions.max; import static nl.peterbloem.kit.Functions.prefix; +import static nl.peterbloem.kit.Pair.p; import static nl.peterbloem.kit.Series.series; import static org.nodes.motifs.MotifCompressor.MOTIF_SYMBOL; @@ -16,11 +17,15 @@ import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashSet; +import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TreeSet; import java.util.concurrent.ExecutorService; +import javax.jws.WebParam.Mode; + import org.nodes.DGraph; import org.nodes.DLink; import org.nodes.DNode; @@ -39,10 +44,11 @@ import org.nodes.models.DegreeSequenceModel; import org.nodes.models.ERSimpleModel; import org.nodes.models.EdgeListModel; +import org.nodes.models.Model; import org.nodes.models.RestrictedToSimple; import org.nodes.models.StructureModel; import org.nodes.models.USequenceEstimator; -import org.nodes.models.DSequenceEstimator.D; +import static org.nodes.models.DSequenceEstimator.D; import org.nodes.util.bootstrap.LogNormalCI; import nl.peterbloem.kit.FrequencyModel; @@ -57,7 +63,7 @@ * Note: we can probably speed up the functions sizeSubbedER and sizeSubbedEL by * implementing it with a loop over motif instances, rather than a loop over the * whole graph. This would make the complexity independent of the size of the - * graph, allowing graphs of arbitrary size to be tackled. + * graph, allowing graphs of arbitrary size to be tackled. * * @author Peter * @@ -134,8 +140,6 @@ public static double size(Graph graph, Graph sub, // nodes in the data bits.add("insertions", log2Factorial(graph.size()) - log2Factorial(subbed.size())); -// bits.print(System.out); - return bits.total(); } @@ -286,6 +290,11 @@ public static List subbedDegrees( rest.add("multi-edges", Functions.prefix(additions.isEmpty() ? 0 : Functions.max(additions))); rest.add("multi-edges", OnlineModel.storeIntegers(additions)); + // * check for any disconnected nodes and add 0s + int expSize = graph.size() - occurrences.size() * (occurrences.get(0).size() - 1); + while(degrees.size() < expSize) + degrees.add(new D(0, 0)); + return degrees; } @@ -417,9 +426,16 @@ public static List subbedDegrees( for(Pair token : instanceToInstance.tokens()) additions.add((int)instanceToInstance.frequency(token) - 1); - rest.add("multi-edges", Functions.prefix(additions.isEmpty() ? 0 : Functions.max(additions))); + rest.add("multi-edges", Functions.prefix( + additions.isEmpty() ? + 0 : Functions.max(additions))); rest.add("multi-edges", OnlineModel.storeIntegers(additions)); + // * check for any disconnected nodes and add 0s + int expSize = graph.size() - occurrences.size() * (occurrences.get(0).size() - 1); + while(result.size() < expSize) + result.add(0); + return result; } @@ -495,6 +511,116 @@ public static double sizeER(UGraph graph, UGraph sub, return bits.total(); } + /** + * A version of the ER model that loops only over the instances. It requires + * the degrees of the graph to be given. + * + * @param graph + * @param degrees + * @param sub + * @param occurrences + * @param resetWiring + * @return + */ + public static double sizeERInst(UGraph graph, UGraph sub, + List> occurrences, boolean resetWiring) + { + FrequencyModel bits = new FrequencyModel(); + + bits.add("sub", erModel.codelength(sub)); + + FrequencyModel> multiEdges = new FrequencyModel>(); + List> rewiring = new LinkedList>(); + + Pair pair = subbedERInstances(graph, sub, occurrences, multiEdges, rewiring); + + // * store the template graph (as a simple graph) + bits.add("subbed", ERSimpleModel.undirected(pair.first(), pair.second(), true)); + + // * store the multi-edges + // - We are storing, for each link, the number of additional edges required + // (so everything's - 1) + double mBits = 0.0; + int max = (int)multiEdges.frequency(multiEdges.maxToken()); + mBits += Functions.prefix(max - 1); + OnlineModel model = new OnlineModel(Series.series(0, max)); + + // - first loop over all rewired edges + for(Pair token : multiEdges.tokens()) + mBits += model.encode((int)multiEdges.frequency(token) - 1); + + bits.add("multi-edges", mBits); + + // * Store the rewiring information + bits.add("wiring", wiringBits(sub, rewiring, resetWiring)); + + // * Store the insertion order, to preserve the precise ordering of the + // nodes in the data + int subbedSize = graph.size() - (sub.size() - 1) * occurrences.size(); + + bits.add("insertions", log2Factorial(graph.size()) - log2Factorial(subbedSize)); + bits.add("labels", Functions.prefix(occurrences.size()) + log2Choose(occurrences.size(), subbedSize)); + + // bits.print(System.out); + + return bits.total(); + } + + /** + * A version of the EL model that loops only over the instances. It requires + * the degrees of the graph to be given. + * + * @param graph + * @param degrees + * @param sub + * @param occurrences + * @param resetWiring + * @return + */ + public static double sizeERInst(DGraph graph, DGraph sub, + List> occurrences, boolean resetWiring) + { + FrequencyModel bits = new FrequencyModel(); + + bits.add("sub", erModel.codelength(sub)); + + FrequencyModel> multiEdges = new FrequencyModel>(); + List> rewiring = new LinkedList>(); + + Pair pair = subbedERInstances(graph, sub, occurrences, multiEdges, rewiring); + + // * store the template graph (as a simple graph) + bits.add("subbed", ERSimpleModel.directed(pair.first(), pair.second(), true)); + + // * store the multi-edges + // - We are storing, for each link, the number of additional edges required + // (so everything's - 1) + double mBits = 0.0; + int max = (int)multiEdges.frequency(multiEdges.maxToken()); + mBits += Functions.prefix(max - 1); + OnlineModel model = new OnlineModel(Series.series(0, max)); + + // - loop over all rewired edges + for(Pair token : multiEdges.tokens()) + mBits += model.encode((int)multiEdges.frequency(token) - 1); + + bits.add("multi-edges", mBits); + + // * Store the rewiring information + bits.add("wiring", wiringBits(sub, rewiring, resetWiring)); + + // * Store the insertion order, to preserve the precise ordering of the + // nodes in the data + int subbedSize = graph.size() - (sub.size() - 1) * occurrences.size(); + + bits.add("insertions", log2Factorial(graph.size()) - log2Factorial(subbedSize)); + bits.add("labels", Functions.prefix(occurrences.size()) + log2Choose(occurrences.size(), subbedSize)); + + // bits.print(System.out); + + return bits.total(); + } + private static void sizeSubbedER(DGraph graph, DGraph sub, List> occurrences, FrequencyModel bits) { @@ -558,10 +684,7 @@ private static void sizeSubbedER(DGraph graph, DGraph sub, } // * size of the subbed graph under the binomial compressor - double n = subbedSize; - double t = n * n - n; - - bits.add("subbed", Functions.prefix((int)n) + Functions.log2(t) + log2Choose(subbedLinks, t)); + bits.add("subbed", ERSimpleModel.directed(subbedSize, subbedLinks, true)); List additions = new ArrayList(graph.size()); for(Pair token : nodeToInstance.tokens()) @@ -570,7 +693,7 @@ private static void sizeSubbedER(DGraph graph, DGraph sub, additions.add((int)instanceToNode.frequency(token) - 1); for(Pair token : instanceToInstance.tokens()) additions.add((int)instanceToInstance.frequency(token) - 1); - + bits.add("multiple-edges", Functions.prefix(additions.isEmpty() ? 0 : max(additions))); bits.add("multiple-edges", OnlineModel.storeIntegers(additions)); } @@ -632,11 +755,8 @@ private static void sizeSubbedER(UGraph graph, UGraph sub, } // * size of the subbed graph under the binomial compressor - double n = subbedSize; - double t = (n * n - n)/2; - - bits.add("subbed", Functions.prefix((int)n) + Functions.log2(t) + log2Choose(subbedLinks, t)); - + bits.add("subbed", ERSimpleModel.undirected(subbedSize, subbedLinks, true)); + List additions = new ArrayList(graph.size()); for(Pair token : nodeToInstance.tokens()) additions.add((int)nodeToInstance.frequency(token) - 1); @@ -675,8 +795,7 @@ public static double sizeEL(DGraph graph, DGraph sub, bits.add("insertions", log2Factorial(graph.size()) - log2Factorial(subbedSize)); bits.add("labels", Functions.prefix(occurrences.size()) + log2Choose(occurrences.size(), subbedSize)); - if(occurrences.size() == 0) - bits.print(System.out); + // bits.print(System.out); return bits.total(); } @@ -699,6 +818,122 @@ public static double sizeEL(UGraph graph, UGraph sub, int subbedSize = graph.size() - (sub.size() - 1) * occurrences.size(); bits.add("insertions", log2Factorial(graph.size()) - log2Factorial(subbedSize)); bits.add("labels", Functions.prefix(occurrences.size()) + log2Choose(occurrences.size(), subbedSize)); + + // bits.print(System.out); + + return bits.total(); + } + + /** + * A version of the EL model that loops only over the instances. It requires + * the degrees of the graph to be given. + * + * @param graph + * @param degrees + * @param sub + * @param occurrences + * @param resetWiring + * @return + */ + public static double sizeEL(DGraph graph, List degrees, DGraph sub, + List> occurrences, boolean resetWiring) + { + FrequencyModel bits = new FrequencyModel(); + + bits.add("sub", elModel.codelength(sub)); + + FrequencyModel> multiEdges = new FrequencyModel>(); + List> rewiring = new LinkedList>(); + + List sDegrees = subbedDegrees(graph, degrees, occurrences, multiEdges, rewiring); + + // * store the template graph (as a simple graph) + bits.add("subbed", EdgeListModel.directed(sDegrees, Prior.COMPLETE)); + + // * store the multi-edges + // - We are storing, for each link, the number of additional edges required + // (so everything's - 1) + double mBits = 0.0; + int max = (int)multiEdges.frequency(multiEdges.maxToken()); + mBits += Functions.prefix(max - 1); + OnlineModel model = new OnlineModel(Series.series(0, max)); + + // - loop over all rewired edges + for(Pair token : multiEdges.tokens()) + mBits += model.encode((int)multiEdges.frequency(token) - 1); + + bits.add("multi-edges", mBits); + + // * Store the rewiring information + bits.add("wiring", wiringBits(sub, rewiring, resetWiring)); + + // * Store the insertion order, to preserve the precise ordering of the + // nodes in the data + int subbedSize = graph.size() - (sub.size() - 1) * occurrences.size(); + + assert(sDegrees.size() == subbedSize); + + bits.add("insertions", log2Factorial(graph.size()) - log2Factorial(subbedSize)); + bits.add("labels", Functions.prefix(occurrences.size()) + log2Choose(occurrences.size(), subbedSize)); + + // bits.print(System.out); + + return bits.total(); + } + + /** + * A version of the EL model that loops only over the instances. It requires + * the degrees of the graph to be given. + * + * @param graph + * @param degrees + * @param sub + * @param occurrences + * @param resetWiring + * @return + */ + public static double sizeEL(UGraph graph, List degrees, UGraph sub, + List> occurrences, boolean resetWiring) + { + FrequencyModel bits = new FrequencyModel(); + + bits.add("sub", elModel.codelength(sub)); + + FrequencyModel> multiEdges = new FrequencyModel>(); + List> rewiring = new LinkedList>(); + + List sDegrees = subbedDegrees(graph, degrees, occurrences, multiEdges, rewiring); + + // * store the template graph (as a simple graph) + bits.add("subbed", EdgeListModel.undirected(sDegrees, Prior.COMPLETE)); + + // * store the multi-edges + // - We are storing, for each link, the number of additional edges required + // (so everything's - 1) + double mBits = 0.0; + int max = (int)multiEdges.frequency(multiEdges.maxToken()); + mBits += Functions.prefix(max - 1); + OnlineModel model = new OnlineModel(Series.series(0, max)); + + // - first loop over all rewired edges + for(Pair token : multiEdges.tokens()) + mBits += model.encode((int)multiEdges.frequency(token) - 1); + + bits.add("multi-edges", mBits); + + // * Store the rewiring information + bits.add("wiring", wiringBits(sub, rewiring, resetWiring)); + + // * Store the insertion order, to preserve the precise ordering of the + // nodes in the data + int subbedSize = graph.size() - (sub.size() - 1) * occurrences.size(); + + assert(sDegrees.size() == subbedSize); + + bits.add("insertions", log2Factorial(graph.size()) - log2Factorial(subbedSize)); + bits.add("labels", Functions.prefix(occurrences.size()) + log2Choose(occurrences.size(), subbedSize)); + + // bits.print(System.out); return bits.total(); } @@ -929,4 +1164,382 @@ public static UGraph subbedGraph( return copy; } + + /** + * Computes the degree sequence of the template graph. This method loops only + * over the list of occurrences, making it faster for large graphs with + * few occurrences. + * + * @param graph + * @param degrees + * @param sub + * @param occurrences + * @param multiEdges An empty frequencymodel receiving how often certain edges in + * the template graph should be repeated (one occurrence in the fm no repeats). + * For performance reasons, the actual indices refer to the old graph, not + * the template graph. + * @param rewiring An empty list, receiving the sequence of rewiring integers. + * @return + */ + public static List subbedDegrees( + UGraph graph, List degrees, + List> occurrences, + FrequencyModel> multiEdges, + List> rewiring) + { + + List subbedDegrees = new ArrayList(degrees); + + // * Which nodes have been mapped to which instance node + Map map = new HashMap(); + // * Links that have been rewired. We store the old link (ie. the left + // and right indices in the original graph). + Set> rewLinks = + new LinkedHashSet>(); + + for(List occurrence : occurrences) + { + + List rw = new LinkedList(); + // * Remove the occurrence (first node becomes instance node) + for(int index : occurrence.subList(1, occurrence.size())) + subbedDegrees.set(index, -1); + subbedDegrees.set(occurrence.get(0), 0); + + for(int index : occurrence) + map.put(index, occurrence.get(0)); + + // * Remove all links linking into an occurrence + for(int i : series(occurrence.size())) + { + int index = occurrence.get(i); + for(UNode node : graph.get(index).neighbors()) + if(! occurrence.contains(node.index())) + { + subbedDegrees.set(node.index(), + subbedDegrees.get(node.index()) - 1); + + rewLinks.add(ordered(index, node.index())); + + rw.add(i); + } + } + + rewiring.add(rw); + } + + // * set occurrence nodes back to 0 + for(List occurrence : occurrences) + subbedDegrees.set(occurrence.get(0), 0); + + // * convert the rewritten links to new indices, and build a + // frequencymodel + for(Pair link : rewLinks) + { + int f = link.first(), s = link.second(); + int a = map.containsKey(f) ? map.get(f) : f; + int b = map.containsKey(s) ? map.get(s) : s; + + multiEdges.add(ordered(a, b)); + } + + // * Add each rewritten link _once_ + for(Pair link : multiEdges.tokens()) + { + subbedDegrees.set(link.first(), subbedDegrees.get(link.first()) + 1); + subbedDegrees.set(link.second(), subbedDegrees.get(link.second()) + 1); + } + + List res = new ArrayList( + graph.size() - occurrences.size() * (occurrences.get(0).size() - 1)); + for(int degree : subbedDegrees) + if(degree >= 0) + res.add(degree); + + return res; + } + + /** + * Computes the degree sequence of the template graph. This method loops only + * over the list of occurrences, making it faster for large graphs with + * few occurrences. + * + * @param graph + * @param degrees + * @param sub + * @param occurrences + * @param multiEdges An empty frequencymodel receiving how often certain edges in + * the template graph should be repeated (one occurrence in the fm no repeats). + * For performance reasons, the actual indices refer to the old graph, not + * the template graph. + * @param rewiring An empty list, receiving the sequence of rewiring integers. + * @return + */ + public static List subbedDegrees( + DGraph graph, List degrees, + List> occurrences, + FrequencyModel> multiEdges, + List> rewiring) + { + + List subbedDegrees = new ArrayList(degrees.size()); + for(D degree : degrees) + subbedDegrees.add(new D(degree.in(), degree.out())); + + // * Which nodes have been mapped to which instance node + Map map = new HashMap(); + // * Links that have been rewired. We store the old link (ie. the left + // and right indices in the original graph). + Set> rewLinks = + new LinkedHashSet>(); + + for(List occurrence : occurrences) + { + + List rw = new LinkedList(); + // * Remove the occurrence (first node becomes instance node) + for(int index : occurrence.subList(1, occurrence.size())) + subbedDegrees.set(index, null); + subbedDegrees.set(occurrence.get(0), new D(0, 0)); + + for(int index : occurrence) + map.put(index, occurrence.get(0)); + + // * Remove all links linking into an occurrence + for(int i : series(occurrence.size())) + { + int index = occurrence.get(i); + for(DNode node : graph.get(index).out()) + if(! occurrence.contains(node.index())) + { + D old = subbedDegrees.get(node.index()); + if(old != null) + subbedDegrees.set(node.index(), new D(old.in() - 1, old.out())); + + rewLinks.add(Pair.p(index, node.index())); + + rw.add(i); + } + + for(DNode node : graph.get(index).in()) + if(! occurrence.contains(node.index())) + { + D old = subbedDegrees.get(node.index()); + if(old != null) + subbedDegrees.set(node.index(), new D(old.in(), old.out() - 1)); + + rewLinks.add(Pair.p(node.index(), index)); + + rw.add(i); + } + } + + rewiring.add(rw); + } + + // * set occurrence nodes back to 0 + for(List occurrence : occurrences) + subbedDegrees.set(occurrence.get(0), new D(0, 0)); + + // * convert the rewritten links to new indices, and build a + // frequencymodel + for(Pair link : rewLinks) + { + int f = link.first(), s = link.second(); + int a = map.containsKey(f) ? map.get(f) : f; + int b = map.containsKey(s) ? map.get(s) : s; + + multiEdges.add(Pair.p(a, b)); + } + + // * Add each rewritten link _once_ + for(Pair link : multiEdges.tokens()) + { + D old; + old = subbedDegrees.get(link.first()); + subbedDegrees.set(link.first(), new D(old.in(), old.out() + 1)); + + old = subbedDegrees.get(link.second()); + subbedDegrees.set(link.second(), new D(old.in() + 1, old.out())); + } + + List res = new ArrayList( + graph.size() - occurrences.size() * (occurrences.get(0).size() - 1)); + + for(D degree : subbedDegrees) + if(degree != null) + res.add(degree); + + + return res; + } + + + /** + * Computes the size and number of links in the template graph, by looping + * only over the instances. This method should be fast for large graphs with + * few instances + * + * @param graph + * @param sub + * @param occurrences + * @param multiEdges An empty frequencymodel receiving how often certain edges in + * the template graph should be repeated (one occurrence in the fm no repeats). + * For performance reasons, the actual indices refer to the old graph, not + * the template graph. + * @param rewiring An empty list, receiving the sequence of rewiring integers. + * @return + */ + public static Pair subbedERInstances( + UGraph graph, UGraph sub, List> occurrences, + FrequencyModel> multiEdges, + List> rewiring) + { + int subbedSize = graph.size() - occurrences.size() * (occurrences.get(0).size() - 1); + int subbedNumLinks = graph.numLinks() - sub.numLinks() * occurrences.size(); + // - we still need to remove multiple links from subbedNumLinks + + // * Which nodes have been mapped to which instance node + Map map = new HashMap(); + // * Links that have been rewired. We store the old link (ie. the left + // and right indices in the original graph). + Set> rewLinks = + new LinkedHashSet>(); + + for(List occurrence : occurrences) + { + List rw = new LinkedList(); + for(int index : occurrence) + map.put(index, occurrence.get(0)); + + // * Remove all links linking into an occurrence + for(int i : series(occurrence.size())) + { + int index = occurrence.get(i); + for(UNode node : graph.get(index).neighbors()) + if(! occurrence.contains(node.index())) + { + rewLinks.add(ordered(index, node.index())); + rw.add(i); + } + } + + rewiring.add(rw); + } + + subbedNumLinks -= rewLinks.size(); + + // * convert the rewritten links to new indices, and build a + // frequencymodel + for(Pair link : rewLinks) + { + int f = link.first(), s = link.second(); + int a = map.containsKey(f) ? map.get(f) : f; + int b = map.containsKey(s) ? map.get(s) : s; + + multiEdges.add(ordered(a, b)); + } + + // * Add each rewritten link _once_ + for(Pair link : multiEdges.tokens()) + subbedNumLinks ++; + + return Pair.p(subbedSize, subbedNumLinks); + } + + /** + * Computes the size and number of links in the template graph, by looping + * only over the instances. This method should be fast for large graphs with + * few instances + * + * @param graph + * @param sub + * @param occurrences + * @param multiEdges An empty frequencymodel receiving how often certain edges in + * the template graph should be repeated (one occurrence in the fm no repeats). + * For performance reasons, the actual indices refer to the old graph, not + * the template graph. + * @param rewiring An empty list, receiving the sequence of rewiring integers. + * @return + */ + public static Pair subbedERInstances( + DGraph graph, DGraph sub, List> occurrences, + FrequencyModel> multiEdges, + List> rewiring) + { + int subbedSize = graph.size() - occurrences.size() * (occurrences.get(0).size() - 1); + int subbedNumLinks = graph.numLinks() - sub.numLinks() * occurrences.size(); + // - we still need to remove multiple links from subbedNumLinks + + // * Which nodes have been mapped to which instance node + Map map = new HashMap(); + // * Links that have been rewired. We store the old link (ie. the left + // and right indices in the original graph). + Set> rewLinks = + new LinkedHashSet>(); + + for(List occurrence : occurrences) + { + List rw = new LinkedList(); + for(int index : occurrence) + map.put(index, occurrence.get(0)); + + // * Remove all links linking into an occurrence + for(int i : series(occurrence.size())) + { + int index = occurrence.get(i); + for(DNode node : graph.get(index).out()) + if(! occurrence.contains(node.index())) + { + rewLinks.add(p(index, node.index())); + + rw.add(i); + } + + for(DNode node : graph.get(index).in()) + if(! occurrence.contains(node.index())) + { + rewLinks.add(p(node.index(), index)); + + rw.add(i); + } + } + + rewiring.add(rw); + } + + subbedNumLinks -= rewLinks.size(); + + // * convert the rewritten links to new indices, and build a + // frequencymodel + for(Pair link : rewLinks) + { + int f = link.first(), s = link.second(); + int a = map.containsKey(f) ? map.get(f) : f; + int b = map.containsKey(s) ? map.get(s) : s; + + multiEdges.add(p(a, b)); + } + + // * Add each rewritten link _once_ + for(Pair link : multiEdges.tokens()) + subbedNumLinks ++; + + return Pair.p(subbedSize, subbedNumLinks); + } + + /** + * Returns a pair of ordered integer (ie. the smallest integer always takes + * the first position). + * + * @param i1 + * @param i2 + * @return + */ + private static Pair ordered(int i1, int i2) + { + if(i1 <= i2) + return new Pair(i1, i2); + return new Pair(i2, i1); + } } diff --git a/src/main/java/nl/peterbloem/motive/MotifSearchModel.java b/src/main/java/nl/peterbloem/motive/MotifSearchModel.java index 746cf5d..db26574 100644 --- a/src/main/java/nl/peterbloem/motive/MotifSearchModel.java +++ b/src/main/java/nl/peterbloem/motive/MotifSearchModel.java @@ -8,6 +8,8 @@ import org.nodes.DGraph; import org.nodes.Graph; import org.nodes.UGraph; +import org.nodes.models.DSequenceEstimator; +import static org.nodes.models.DSequenceEstimator.D; import org.nodes.models.StructureModel; import org.nodes.util.Fibonacci; @@ -112,6 +114,74 @@ public double size(Graph graph, Graph sub, return find.size(); } + public static double sizeELInst(UGraph graph, final List degrees, UGraph sub, List> occurrences, boolean resetWiring, int depth) + { + Function> function = new Function>() + { + public double size(UGraph graph, UGraph sub, + List> occurrences, boolean resetWiring) + { + return MotifModel.sizeEL(graph, degrees, sub, occurrences, resetWiring); + } + }; + + FindPhi> find + = new FindPhi>(graph, sub, occurrences, resetWiring, depth, function); + + return find.size(); + } + + public static double sizeELInst(DGraph graph, final List degrees, DGraph sub, List> occurrences, boolean resetWiring, int depth) + { + Function> function = new Function>() + { + public double size(DGraph graph, DGraph sub, + List> occurrences, boolean resetWiring) + { + return MotifModel.sizeEL(graph, degrees, sub, occurrences, resetWiring); + } + }; + + FindPhi> find + = new FindPhi>(graph, sub, occurrences, resetWiring, depth, function); + + return find.size(); + } + + public static double sizeERInst(UGraph graph, UGraph sub, List> occurrences, boolean resetWiring, int depth) + { + Function> function = new Function>() + { + public double size(UGraph graph, UGraph sub, + List> occurrences, boolean resetWiring) + { + return MotifModel.sizeERInst(graph,sub, occurrences, resetWiring); + } + }; + + FindPhi> find + = new FindPhi>(graph, sub, occurrences, resetWiring, depth, function); + + return find.size(); + } + + public static double sizeERInst(DGraph graph, DGraph sub, List> occurrences, boolean resetWiring, int depth) + { + Function> function = new Function>() + { + public double size(DGraph graph, DGraph sub, + List> occurrences, boolean resetWiring) + { + return MotifModel.sizeERInst(graph, sub, occurrences, resetWiring); + } + }; + + FindPhi> find + = new FindPhi>(graph, sub, occurrences, resetWiring, depth, function); + + return find.size(); + } + private static interface Function> { public double size(G graph, G sub, List> occurrences, boolean resetWiring); } diff --git a/src/main/java/nl/peterbloem/motive/exec/Compare.java b/src/main/java/nl/peterbloem/motive/exec/Compare.java index e919829..c9e169b 100644 --- a/src/main/java/nl/peterbloem/motive/exec/Compare.java +++ b/src/main/java/nl/peterbloem/motive/exec/Compare.java @@ -158,20 +158,26 @@ public static enum NullModel{ER, EDGELIST, BETA} */ private boolean resets = true; + /** + * Relative number of available threads devoted to sampling. + */ + public double mix = 0.66; + public void main() throws IOException { // * set up thread pools - double mix = 0.666; // relative number of threads devoted to sampling // - concurrent threads for sampling int sThreads = Math.max(1, (int)(Global.numThreads() * mix)); + if((sThreads >= Global.numThreads()) && (sThreads > 1)) + sThreads = Global.numThreads() - 1; // - concurrent threads for computing scores int mThreads = Math.max(1, Global.numThreads() - sThreads); - Global.log().info(sThreads + " for sampling, " + mThreads + " for computing motif scores."); + Global.log().info("Concurrent threads: " + sThreads + " for sampling, " + mThreads + " for computing motif scores."); ExecutorService samplesExecutor = Executors.newFixedThreadPool(sThreads); - ExecutorService motifsExecutor = Executors.newFixedThreadPool(sThreads); + ExecutorService motifsExecutor = Executors.newFixedThreadPool(mThreads); MotifModel.setExecutor(samplesExecutor); @@ -244,6 +250,8 @@ public void main() throws IOException subs = subsAll; frequencies = frequenciesAll; } + subsAll = null; + frequenciesAll = null; final Map, Double> factorsERMap = new ConcurrentHashMap, Double>(); final Map, Double> factorsELMap = new ConcurrentHashMap, Double>(); @@ -323,14 +331,16 @@ public void run(){ throw new RuntimeException(e); } - List factorsER = new ArrayList(subsAll.size()); - List factorsEL = new ArrayList(subsAll.size()); - List factorsBeta = new ArrayList(subsAll.size()); - List maxFactors = new ArrayList(subsAll.size()); + samplesExecutor.shutdown(); - for(int i : series(subsAll.size())) + List factorsER = new ArrayList(subs.size()); + List factorsEL = new ArrayList(subs.size()); + List factorsBeta = new ArrayList(subs.size()); + List maxFactors = new ArrayList(subs.size()); + + for(int i : series(subs.size())) { - Graph sub = subsAll.get(i); + Graph sub = subs.get(i); factorsER.add(factorsERMap.get(sub)); factorsEL.add(factorsELMap.get(sub)); factorsBeta.add(factorsBetaMap.get(sub)); @@ -340,21 +350,21 @@ public void run(){ Comparator comp = Functions.natural(); Functions.sort( factorsBeta, Collections.reverseOrder(comp), - (List) frequenciesAll, + (List) frequencies, (List) factorsER, (List) factorsEL, (List) factorsBeta, - (List) subsAll); + (List) subs); File numbersFile = new File("numbers.csv"); BufferedWriter numbersWriter = new BufferedWriter(new FileWriter(numbersFile)); - for(int i : series(subsAll.size())) - numbersWriter.write(frequenciesAll.get(i) + ", " + factorsER.get(i) + ", " + factorsEL.get(i) + ", " + factorsBeta.get(i) + "\n"); + for(int i : series(subs.size())) + numbersWriter.write(frequencies.get(i) + ", " + factorsER.get(i) + ", " + factorsEL.get(i) + ", " + factorsBeta.get(i) + "\n"); numbersWriter.close(); int i = 0; - for(Graph sub : subsAll) + for(Graph sub : subs) { File graphFile = new File(String.format("motif.%03d.edgelist", i)); Data.writeEdgeList(sub, graphFile); @@ -369,7 +379,7 @@ public void run(){ obj.put("baseline el", baselineEL); obj.put("baseline beta", baselineBeta); Functions.write(obj.toString(), new File( "metadata.json")); - + try { FileIO.python(new File("."), "scripts/plot.py"); diff --git a/src/main/java/nl/peterbloem/motive/exec/CompareLarge.java b/src/main/java/nl/peterbloem/motive/exec/CompareLarge.java index c120980..45e2f12 100644 --- a/src/main/java/nl/peterbloem/motive/exec/CompareLarge.java +++ b/src/main/java/nl/peterbloem/motive/exec/CompareLarge.java @@ -48,6 +48,7 @@ import org.nodes.compression.NeighborListCompressor; import org.nodes.data.Data; import org.nodes.models.DSequenceEstimator; +import static org.nodes.models.DSequenceEstimator.D; import org.nodes.models.DegreeSequenceModel.Prior; import org.nodes.models.ERSimpleModel; import org.nodes.models.EdgeListModel; @@ -125,12 +126,19 @@ public class CompareLarge * Whether to reset the DM model at every motif instance. */ private boolean resets = true; + + /** + * Whether to loop over the graph or over the instances. + */ + public boolean graphLoop = false; public void main() throws IOException { nl.peterbloem.kit.Global.secureRandom(42); Global.log().info("Computing motif code lengths"); + + final List degrees = graphLoop ? null : DSequenceEstimator.sequence(data); // * Sample for motifs, and collect the results @@ -150,7 +158,7 @@ public void main() throws IOException for(Graph sub : subsAll) occurrences.add(ex.occurrences((DGraph)sub)); - // - select the top motifs by frquency + // - select the top motifs by frequency final List> subs; final List frequencies; @@ -191,7 +199,9 @@ public void run() { Global.log().info("null model: ER"); - double sizeER = MotifSearchModel.sizeER(data, sub, occs, resets, searchDepth); + double sizeER = graphLoop ? + MotifSearchModel.sizeER(data, sub, occs, resets, searchDepth) : + MotifSearchModel.sizeERInst(data, sub, occs, resets, searchDepth); double factorER = baselineER - sizeER; factorsERMap.put(sub, factorER); @@ -203,7 +213,9 @@ public void run() { Global.log().info("null model: EL"); - double sizeEL = MotifSearchModel.sizeEL(data, sub, occs, resets, searchDepth); + double sizeEL = graphLoop ? + MotifSearchModel.sizeEL(data, sub, occs, resets, searchDepth) : + MotifSearchModel.sizeELInst(data, degrees, sub, occs, resets, searchDepth); double factorEL = baselineEL - sizeEL; factorsELMap.put(sub, factorEL); diff --git a/src/main/java/nl/peterbloem/motive/exec/Run.java b/src/main/java/nl/peterbloem/motive/exec/Run.java index 17c650d..f2ddcab 100644 --- a/src/main/java/nl/peterbloem/motive/exec/Run.java +++ b/src/main/java/nl/peterbloem/motive/exec/Run.java @@ -54,11 +54,21 @@ public class Run { @Option(name="--threads", usage="Number of threads to run simultaneaously. Default is the number of cores available. In full mode, there will always be at least 2 concurrent threads, even if this value is 1.") private static int threads = Global.numThreads(); + @Option( + name="--fast.graphloop", + usage="Loop over the graph instead of the instances when computing the score. A little faster when there are many instances, but a lot slower when there are few.") + private static boolean graphLoop = false; + @Option( name="--full.depth", usage="The search depth for the DS model.") private static int dsDepth = 3; + @Option( + name="--full.mix", + usage="What proportion of available cores to use for motif computation (with the rest used for sampling). Changing this parameter won't affect the end result, but it might lead to better utilitzation of the available cores. If 1.0, the motif scores are computed one by one, sequentially, and the free cores are used to sample for the DS model. If 0.0, the motif scores are computed in parallel, and sampling is done single-threaded.") + private static double mix = 0.4; + @Option(name="--help", usage="Print usage information.", aliases={"-h"}, help=true) private static boolean help = false; @@ -190,6 +200,7 @@ else if ("gml".equals(filetype.toLowerCase().trim())) large.motifMaxSize = maxSize; large.maxMotifs = maxMotifs; large.motifSamples = samples; + large.graphLoop = graphLoop; Global.log().info("Starting experiment."); Functions.tic(); @@ -235,6 +246,7 @@ else if ("gml".equals(filetype.toLowerCase().trim())) full.maxMotifs = maxMotifs; full.motifSamples = samples; full.betaSearchDepth = dsDepth; + full.mix = mix; Global.log().info("Starting experiment."); Functions.tic(); diff --git a/src/test/java/nl/peterbloem/motive/MotifModelTest.java b/src/test/java/nl/peterbloem/motive/MotifModelTest.java index 935dbc6..c0ab87a 100644 --- a/src/test/java/nl/peterbloem/motive/MotifModelTest.java +++ b/src/test/java/nl/peterbloem/motive/MotifModelTest.java @@ -2,6 +2,7 @@ import static java.lang.Math.max; import static java.lang.Math.min; +import static java.util.Arrays.asList; import static nl.peterbloem.kit.Functions.log2Choose; import static nl.peterbloem.kit.Functions.log2Factorial; import static nl.peterbloem.kit.Functions.prefix; @@ -13,9 +14,11 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.LinkedHashSet; +import java.util.LinkedList; import java.util.List; import java.util.Random; import java.util.Set; @@ -532,4 +535,280 @@ public void sparseTest() MotifModel.sizeEL(graph, sub, ex.occurrences(sub), true); } + + @Test + public void instanceLoopTest() + { + UGraph graph = new MapUTGraph(); + + UNode a = graph.add("x"); + UNode b = graph.add("x"); + UNode c = graph.add("x"); + UNode d = graph.add("x"); + UNode e = graph.add("x"); + UNode f = graph.add("x"); + + a.connect(b); + a.connect(c); + b.connect(c); + b.connect(e); + c.connect(d); + c.connect(e); + d.connect(e); + d.connect(f); + e.connect(f); + + UGraph sub = new MapUTGraph(); + + UNode x = sub.add("x"); + UNode y = sub.add("x"); + UNode z = sub.add("x"); + + x.connect(y); + y.connect(z); + z.connect(x); + + List> occurrences = asList(asList(0, 1, 2), asList(3, 4, 5)); + List degrees = Graphs.degrees(graph); + + assertEquals(asList(1, 1), + MotifModel.subbedDegrees(graph, degrees, occurrences, + new FrequencyModel>(), + new LinkedList>())); + + double bitsOld = MotifModel.sizeEL(graph, sub, occurrences, true); + double bitsNew = MotifModel.sizeEL(graph, degrees, sub, occurrences, true); + + assertEquals(bitsOld, bitsNew, 0.000000000000001); + } + + @Test + public void instanceLoopTest2() + { + UGraph graph = new MapUTGraph(); + + UNode a = graph.add("x"); + UNode b = graph.add("x"); + UNode c = graph.add("x"); + UNode d = graph.add("x"); + UNode e = graph.add("x"); + UNode f = graph.add("x"); + + UNode g = graph.add("x"); + UNode h = graph.add("x"); + UNode i = graph.add("x"); + + a.connect(b); + a.connect(c); + b.connect(c); + b.connect(e); + c.connect(d); + c.connect(e); + d.connect(e); + d.connect(f); + e.connect(f); + + f.connect(g); + g.connect(h); + h.connect(i); + i.connect(f); + + UGraph sub = new MapUTGraph(); + + UNode x = sub.add("x"); + UNode y = sub.add("x"); + UNode z = sub.add("x"); + + x.connect(y); + y.connect(z); + z.connect(x); + + List> occurrences = asList(asList(0, 1, 2), asList(3, 4, 5)); + List degrees = Graphs.degrees(graph); + System.out.println(degrees); + + assertEquals(asList(1, 3, 2, 2, 2), + MotifModel.subbedDegrees(graph, degrees, occurrences, + new FrequencyModel>(), + new LinkedList>())); + + double bitsOld = MotifModel.sizeEL(graph, sub, occurrences, true); + double bitsNew = MotifModel.sizeEL(graph, degrees, sub, occurrences, true); + + assertEquals(bitsOld, bitsNew, 0.000000000000001); + } + + @Test + public void instanceLoopTest3() + { + for(int i : series(20)) + { + UGraph graph = RandomGraphs.random(100, 200); + + UPlainMotifExtractor ex = new UPlainMotifExtractor(graph, 100, 3, 4, 1); + List degrees = Graphs.degrees(graph); + + for(UGraph sub : ex.subgraphs()) + { + double sizeSlow = MotifModel.size(graph, sub, ex.occurrences(sub), new EdgeListModel(Prior.COMPLETE), true); + double sizeOld = MotifModel.sizeEL(graph, sub, ex.occurrences(sub), true); + double sizeNew = MotifModel.sizeEL(graph, degrees, sub, ex.occurrences(sub), true); + + + assertEquals(sizeOld, sizeNew, 0.000001); + assertEquals(sizeSlow, sizeNew, 0.000001); + + } + } + } + + @Test + public void instanceLoopTestDirected() + { + for(int i : series(1)) + { + DGraph graph = RandomGraphs.randomDirected(10, 10); + + DPlainMotifExtractor ex = new DPlainMotifExtractor(graph, 100, 3, 4, 1); + + List degrees = DSequenceEstimator.sequence(graph); + + for(DGraph sub : ex.subgraphs()) + { + System.out.println("sub " + sub + ", " + ex.occurrences(sub).size() + " occurrences"); + double sizeSlow = MotifModel.size(graph, sub, ex.occurrences(sub), new EdgeListModel(Prior.COMPLETE), true); + double sizeOld = MotifModel.sizeEL(graph, sub, ex.occurrences(sub), true); + double sizeNew = MotifModel.sizeEL(graph, degrees, sub, ex.occurrences(sub), true); + + assertEquals(sizeOld, sizeNew, 0.000001); + assertEquals(sizeSlow, sizeNew, 0.000001); + + } + } + } + + +// @Test + public void instanceLoopTestTiming() + { + UGraph graph = RandomGraphs.randomFast(100000, 2000000); + System.out.println("graph generated."); + + UPlainMotifExtractor ex = new UPlainMotifExtractor(graph, 1000000, 3, 6, 1); + List degrees = Graphs.degrees(graph); + + for(UGraph sub : ex.subgraphs()) + { + System.out.println(ex.occurrences(sub).size() + " occurrences:"); + double tOld, tNew; + tic(); + double sizeOld = MotifModel.sizeER(graph, sub, ex.occurrences(sub), true); + tOld = toc(); + + tic(); + double sizeNew = MotifModel.sizeERInst(graph, sub, ex.occurrences(sub), true); + tNew = toc(); + + System.out.println(" old: " + tOld + " seconds"); + System.out.println(" new: " + tNew + " seconds"); + + } + } + +// @Test + public void instanceLoopDirectedTiming() + { + DGraph graph = RandomGraphs.randomDirectedFast(100000, 2000000); + System.out.println("graph generated."); + + DPlainMotifExtractor ex = new DPlainMotifExtractor(graph, 100000, 3, 6, 1); + List degrees = DSequenceEstimator.sequence(graph); + + for(DGraph sub : ex.subgraphs()) + { + System.out.println(ex.occurrences(sub).size() + " occurrences:"); + double tOld, tNew; + tic(); + double sizeOld = MotifModel.sizeER(graph, sub, ex.occurrences(sub), true); + + tOld = toc(); + + tic(); + double sizeNew = MotifModel.sizeERInst(graph, sub, ex.occurrences(sub), true); + tNew = toc(); + + System.out.println(" old: " + tOld + " seconds"); + System.out.println(" new: " + tNew + " seconds"); + + } + } + + public void instanceLoopTestTiming2() + { + UGraph graph = RandomGraphs.randomFast(10000, 100000); + System.out.println("graph generated."); + + UPlainMotifExtractor ex = new UPlainMotifExtractor(graph, 100000, 3, 5, 1); + List degrees = Graphs.degrees(graph); + + UGraph sub = ex.subgraphs().get(0); + + tic(); + for(int i : series(50)) + MotifModel.sizeEL(graph, degrees, sub, ex.occurrences(sub), true); + System.out.println(toc() + " seconds."); + + } + + + @Test + public void instanceLoopTestERU() + { + for(int i : series(20)) + { + UGraph graph = RandomGraphs.random(100, 200); + + UPlainMotifExtractor ex = new UPlainMotifExtractor(graph, 100, 3, 4, 1); + List degrees = Graphs.degrees(graph); + + for(UGraph sub : ex.subgraphs()) + { + double sizeSlow = MotifModel.size(graph, sub, ex.occurrences(sub), new ERSimpleModel(true), true); + double sizeOld = MotifModel.sizeER(graph, sub, ex.occurrences(sub), true); + double sizeNew = MotifModel.sizeERInst(graph, sub, ex.occurrences(sub), true); + + System.out.println(sizeSlow + " " + sizeOld + " " + sizeNew); + + assertEquals(sizeOld, sizeNew, 0.000001); + assertEquals(sizeSlow, sizeNew, 0.000001); + + } + } + } + + + @Test + public void instanceLoopTestERD() + { + for(int i : series(20)) + { + DGraph graph = RandomGraphs.randomDirectedFast(100, 200); + + DPlainMotifExtractor ex = new DPlainMotifExtractor(graph, 100, 3, 4, 1); + + for(DGraph sub : ex.subgraphs()) + { + double sizeSlow = MotifModel.size(graph, sub, ex.occurrences(sub), new ERSimpleModel(true), true); + double sizeOld = MotifModel.sizeER(graph, sub, ex.occurrences(sub), true); + double sizeNew = MotifModel.sizeERInst(graph, sub, ex.occurrences(sub), true); + + System.out.println(sizeSlow + " " + sizeOld + " " + sizeNew); + + assertEquals(sizeOld, sizeNew, 0.000001); + assertEquals(sizeSlow, sizeNew, 0.000001); + + } + } + } + }