From 74d34490bd035c4845c0b0f63df84379bad1f4dd Mon Sep 17 00:00:00 2001 From: Jake Biesinger Date: Wed, 20 Nov 2013 16:31:28 -0800 Subject: [PATCH 01/12] BUILD-profile optimize: use static final EDGETYPE.values not values() --- .../genomix/minicluster/GenerateGraphViz.java | 2 +- .../java/edu/uci/ics/genomix/type/DIR.java | 4 ++-- .../edu/uci/ics/genomix/type/EDGETYPE.java | 5 ++-- .../java/edu/uci/ics/genomix/type/Node.java | 24 +++++++++---------- .../edu/uci/ics/genomix/type/NodeTest.java | 13 +++++----- .../contrailgraphbuilding/GenomixReducer.java | 2 +- .../genomix/hadoop/graph/GraphStatistics.java | 2 +- .../AggregateKmerAggregateFactory.java | 4 ++-- .../operator/DeBruijnGraphCleanVertex.java | 6 ++--- .../bubblemerge/SimpleBubbleMergeVertex.java | 2 +- .../pathmerge/BasicPathMergeVertex.java | 6 ++--- .../UnrollTandemRepeat.java | 4 ++-- .../pregelix/testhelper/BubbleAddVertex.java | 2 +- 13 files changed, 39 insertions(+), 37 deletions(-) diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/minicluster/GenerateGraphViz.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/minicluster/GenerateGraphViz.java index ea10401eb..af657f312 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/minicluster/GenerateGraphViz.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/minicluster/GenerateGraphViz.java @@ -148,7 +148,7 @@ public static byte[] convertGraphToImg(JobConf conf, String srcDir, String destD public static String convertEdgeToGraph(String outputNode, Node value, GRAPH_TYPE graphType) { String outputEdge = ""; - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { for (Entry e : value.getEdgeMap(et).entrySet()) { String destNode = ""; switch (graphType) { diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/DIR.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/DIR.java index 533787720..90c2ee970 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/DIR.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/DIR.java @@ -44,11 +44,11 @@ public static byte fromSet(EnumSet set) { return b; } - public final EnumSet edgeTypes() { + public final EDGETYPE[] edgeTypes() { return edgeTypesInDir(this); } - public static final EnumSet edgeTypesInDir(DIR direction) { + public static final EDGETYPE[] edgeTypesInDir(DIR direction) { return direction == DIR.REVERSE ? EDGETYPE.INCOMING : EDGETYPE.OUTGOING; } diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EDGETYPE.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EDGETYPE.java index 01f589be6..076553e99 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EDGETYPE.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EDGETYPE.java @@ -26,8 +26,9 @@ public final byte get() { return val; } - public static final EnumSet INCOMING = EnumSet.of(RF, RR); - public static final EnumSet OUTGOING = EnumSet.of(FF, FR); + public static final EDGETYPE[] values = {FF, FR, RF, RR}; + public static final EDGETYPE[] INCOMING = {RF, RR}; + public static final EDGETYPE[] OUTGOING = {FF, FR}; public static EDGETYPE fromByte(short b) { b &= MASK; diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java index 37ac8eb48..b42f9debe 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java @@ -212,7 +212,7 @@ public EDGETYPE getNeighborEdgeType(DIR direction) { if (degree(direction) != 1) throw new IllegalArgumentException( "getEdgetypeFromDir is used on the case, in which the vertex has and only has one EDGETYPE!"); - EnumSet ets = direction.edgeTypes(); + EDGETYPE[] ets = direction.edgeTypes(); for (EDGETYPE et : ets) { if (edges[et.get()] != null && getEdgeMap(et).size() > 0) { return et; @@ -268,7 +268,7 @@ public EdgeMap[] getEdges() { } public void setEdges(EdgeMap[] edges) { - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { setEdgeMap(et, edges[et.get()]); } } @@ -343,7 +343,7 @@ public void setFlippedReadIds(ReadHeadSet flippedReadIds) { */ public int getSerializedLength() { int length = Byte.SIZE / 8; // byte header - for (EDGETYPE e : EDGETYPE.values()) { + for (EDGETYPE e : EDGETYPE.values) { if (edges[e.get()] != null && edges[e.get()].size() > 0) { length += edges[e.get()].getLengthInBytes(); } @@ -377,7 +377,7 @@ public void setAsCopy(byte[] data, int offset) { reset(); byte activeFields = data[offset]; offset += 1; - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { // et.get() is the index of the bit; if non-zero, we this edge is present in the stream if ((activeFields & (1 << et.get())) != 0) { getEdgeMap(et).setAsCopy(data, offset); @@ -406,7 +406,7 @@ public void setAsReference(byte[] data, int offset) { reset(); byte activeFields = data[offset]; offset += 1; - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { // et.get() is the index of the bit; if non-zero, we this edge is present in the stream if ((activeFields & (1 << et.get())) != 0) { getEdgeMap(et).setAsReference(data, offset); @@ -433,7 +433,7 @@ public void setAsReference(byte[] data, int offset) { public static void write(Node n, DataOutput out) throws IOException { out.writeByte(n.getActiveFields()); - for (EDGETYPE e : EDGETYPE.values()) { + for (EDGETYPE e : EDGETYPE.values) { if (n.edges[e.get()] != null && n.edges[e.get()].size() > 0) { n.edges[e.get()].write(out); } @@ -461,7 +461,7 @@ public void write(DataOutput out) throws IOException { public void readFields(DataInput in) throws IOException { reset(); byte activeFields = in.readByte(); - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { // et.get() is the index of the bit; if non-zero, we this edge is present in the stream if ((activeFields & (1 << et.get())) != 0) { getEdgeMap(et).readFields(in); @@ -492,7 +492,7 @@ protected static class NODE_FIELDS { protected byte getActiveFields() { byte fields = 0; // bits 0-3 are for presence of edges - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { if (edges[et.get()] != null && edges[et.get()].size() > 0) { fields |= 1 << et.get(); } @@ -530,7 +530,7 @@ public boolean equals(Object o) { return false; Node nw = (Node) o; - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { // If I'm null, return false if he's not null; otherwise, do a regular .equals if (edges[et.get()] == null ? nw.edges[et.get()] != null : edges[et.get()].equals(nw.edges[et.get()])) { return false; @@ -550,7 +550,7 @@ public boolean equals(Object o) { public String toString() { StringBuilder sbuilder = new StringBuilder(); sbuilder.append('{'); - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { sbuilder.append(et + ":").append(edges[et.get()] == null ? "null" : edges[et.get()].toString()) .append('\t'); } @@ -740,7 +740,7 @@ public void mergeEdges(EDGETYPE edgeType, Node other) { protected void addEdges(boolean flip, Node other) { if (!flip) { - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { unionUpdateEdgeMap(et, et, other.edges); } } else { @@ -848,7 +848,7 @@ protected void mergeUnflippedAndFlippedReadIDs(EDGETYPE edgeType, Node other) { * Debug helper function to find the edge associated with the given kmer, checking all directions. If the edge doesn't exist in any direction, returns null */ public NeighborInfo findEdge(final VKmer kmer) { - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { if (edges[et.get()] != null && edges[et.get()].containsKey(kmer)) { return new NeighborInfo(et, kmer, edges[et.get()].get(kmer)); } diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/NodeTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/NodeTest.java index 4bf4cf741..79f545860 100644 --- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/NodeTest.java +++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/NodeTest.java @@ -7,6 +7,7 @@ import java.io.IOException; import java.util.AbstractMap.SimpleEntry; import java.util.ArrayList; +import java.util.Arrays; import java.util.EnumSet; import java.util.Iterator; import java.util.Map; @@ -48,7 +49,7 @@ public static void assembleNodeRandomly(Node targetNode, int orderNum) { ArrayList> sampleList; SimpleEntry edgeId; EdgeMap edge; - for (EDGETYPE e : EDGETYPE.values()) { + for (EDGETYPE e : EDGETYPE.values) { sampleList = new ArrayList>(); for (int i = 0; i < min + (int) (Math.random() * ((max - min) + 1)); i++) { String edgeStr = generateString(orderNum); @@ -77,7 +78,7 @@ public static void assembleNodeRandomly(Node targetNode, int orderNum) { public static void printSrcNodeInfo(Node srcNode) { System.out.println("InternalKmer: " + srcNode.getInternalKmer().toString()); - for (EDGETYPE e : EDGETYPE.values()) { + for (EDGETYPE e : EDGETYPE.values) { System.out.println(e.toString()); for (Map.Entry iter : srcNode.getEdgeMap(e).entrySet()) { System.out.println("edgeKmer: " + iter.getKey().toString()); @@ -101,7 +102,7 @@ public static void printSrcNodeInfo(Node srcNode) { public static void compareTwoNodes(Node et1, Node et2) { Assert.assertEquals(et1.getInternalKmer().toString(), et2.getInternalKmer().toString()); - for (EDGETYPE e : EDGETYPE.values()) { + for (EDGETYPE e : EDGETYPE.values) { Assert.assertEquals(et1.getEdgeMap(e).size(), et2.getEdgeMap(e).size()); for (Map.Entry iter1 : et1.getEdgeMap(e).entrySet()) { Map.Entry iter2 = et2.getEdgeMap(e).pollFirstEntry(); @@ -127,7 +128,7 @@ public static void getEdgeMapRandomly(EdgeMap edgeMap, int orderNum) { int max = 4; ArrayList> sampleList; SimpleEntry edgeId; - for (EDGETYPE e : EDGETYPE.values()) { + for (EDGETYPE e : EDGETYPE.values) { sampleList = new ArrayList>(); for (int i = 0; i < min + (int) (Math.random() * ((max - min) + 1)); i++) { String edgeStr = generateString(orderNum); @@ -188,9 +189,9 @@ public void testDIR() throws IOException { Assert.assertEquals(0b11 << 2, DIR.fromSet(EnumSet.allOf(DIR.class))); Assert.assertEquals(0b00 << 2, DIR.fromSet(EnumSet.noneOf(DIR.class))); - EnumSet edgeTypes1 = testDir1.edgeTypes(); + EnumSet edgeTypes1 = EnumSet.copyOf(Arrays.asList(testDir1.edgeTypes())); EnumSet edgeExample1 = EnumSet.noneOf(EDGETYPE.class); - EnumSet edgeTypes2 = testDir2.edgeTypes(); + EnumSet edgeTypes2 = EnumSet.copyOf(Arrays.asList(testDir2.edgeTypes())); EnumSet edgeExample2 = EnumSet.noneOf(EDGETYPE.class); edgeExample1.add(EDGETYPE.FF); edgeExample1.add(EDGETYPE.FR); diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixReducer.java index ce513835e..64247c764 100644 --- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixReducer.java +++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixReducer.java @@ -37,7 +37,7 @@ public void reduce(VKmer key, Iterator values, OutputCollector outpu long totalEdgeReads = 0; long totalSelf = 0; - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { for (Entry e : value.getEdgeMap(et).entrySet()) { totalEdgeReads += e.getValue().size(); if (e.getKey().equals(key)) { diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/AggregateKmerAggregateFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/AggregateKmerAggregateFactory.java index 587b80d67..e5ebf2dbe 100644 --- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/AggregateKmerAggregateFactory.java +++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/AggregateKmerAggregateFactory.java @@ -106,7 +106,7 @@ public void init(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, i // } // } - for (EDGETYPE e : EDGETYPE.values()) { + for (EDGETYPE e : EDGETYPE.values) { localUniNode.getEdgeMap(e).unionUpdate((readNode.getEdgeMap(e))); } localUniNode.getUnflippedReadIds().addAll(readNode.getUnflippedReadIds()); @@ -121,7 +121,7 @@ public void aggregate(IFrameTupleAccessor accessor, int tIndex, IFrameTupleAcces Node localUniNode = (Node) state.state; readNode.setAsCopy(accessor.getBuffer().array(), getOffSet(accessor, tIndex, 1)); - for (EDGETYPE e : EDGETYPE.values()) { + for (EDGETYPE e : EDGETYPE.values) { localUniNode.getEdgeMap(e).unionUpdate(readNode.getEdgeMap(e)); } localUniNode.getUnflippedReadIds().addAll(readNode.getUnflippedReadIds()); diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/DeBruijnGraphCleanVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/DeBruijnGraphCleanVertex.java index 8d9044e5d..dcd918ba1 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/DeBruijnGraphCleanVertex.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/DeBruijnGraphCleanVertex.java @@ -189,7 +189,7 @@ public VKmer getDestVertexId(DIR direction) { + getVertexValue().toString()); if (degree == 1) { - EnumSet edgeTypes = direction.edgeTypes(); + EDGETYPE[] edgeTypes = direction.edgeTypes(); for (EDGETYPE et : edgeTypes) { if (getVertexValue().getEdgeMap(et).size() > 0) return getVertexValue().getEdgeMap(et).firstKey(); @@ -205,7 +205,7 @@ public VKmer getDestVertexId(DIR direction) { * check if I am a tandemRepeat */ public boolean isTandemRepeat(VertexValueWritable value) { - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { for (VKmer kmerToCheck : value.getEdgeMap(et).keySet()) { if (kmerToCheck.equals(getVertexId())) { repeatEdgetype = et; @@ -222,7 +222,7 @@ public boolean isTandemRepeat(VertexValueWritable value) { */ public void broadcastKillself() { VertexValueWritable vertex = getVertexValue(); - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { for (VKmer dest : vertex.getEdgeMap(et).keySet()) { outgoingMsg.reset(); outFlag &= EDGETYPE.CLEAR; diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/bubblemerge/SimpleBubbleMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/bubblemerge/SimpleBubbleMergeVertex.java index e16bf3f6c..5094dc55d 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/bubblemerge/SimpleBubbleMergeVertex.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/bubblemerge/SimpleBubbleMergeVertex.java @@ -221,7 +221,7 @@ public void receiveUpdates(Iterator msgIterator) { deleteVertex(getVertexId()); break; case ADD_READIDS: - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { EdgeMap edgeMap = incomingMsg.getNode().getEdgeMap(et); if (edgeMap.size() > 0) { getVertexValue().getEdgeMap(et).unionUpdate(edgeMap); diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java index a160ea799..9993441b4 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java @@ -112,10 +112,10 @@ public void updateNeighbors() { } DIR mergeDir = edgeType.dir(); - EnumSet mergeEdges = mergeDir.edgeTypes(); + EDGETYPE[] mergeEdges = mergeDir.edgeTypes(); DIR updateDir = mergeDir.mirror(); - EnumSet updateEdges = updateDir.edgeTypes(); + EDGETYPE[] updateEdges = updateDir.edgeTypes(); // prepare the update message s.t. the receiver can do a simple unionupdate // that means we figure out any hops and place our merge-dir edges in the appropriate list of the outgoing msg @@ -154,7 +154,7 @@ public void receiveUpdates(Iterator msgIterator) { // remove the edge to the node that will merge elsewhere vertex.getEdgeMap(EDGETYPE.fromByte(incomingMsg.getFlag())).remove(incomingMsg.getSourceVertexId()); // add the node this neighbor will merge into - for (EDGETYPE edgeType : EDGETYPE.values()) { + for (EDGETYPE edgeType : EDGETYPE.values) { vertex.getEdgeMap(edgeType).unionUpdate(incomingMsg.getEdgeList(edgeType)); } updated = true; diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/unrolltandemrepeat/UnrollTandemRepeat.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/unrolltandemrepeat/UnrollTandemRepeat.java index fa3a98ca5..9bca1f8f6 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/unrolltandemrepeat/UnrollTandemRepeat.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/unrolltandemrepeat/UnrollTandemRepeat.java @@ -47,7 +47,7 @@ public boolean repeatCanBeMerged() { tmpValue.getEdgeMap(repeatEdgetype).remove(repeatKmer); boolean hasFlip = false; // pick one edge and flip - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { for (Entry edge : tmpValue.getEdgeMap(et).entrySet()) { EDGETYPE flipEt = et.flipNeighbor(); tmpValue.getEdgeMap(flipEt).put(edge.getKey(), edge.getValue()); @@ -75,7 +75,7 @@ public void mergeTandemRepeat() { getVertexValue().getEdgeMap(repeatEdgetype).remove(getVertexId()); boolean hasFlip = false; /** pick one edge and flip **/ - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { for (Entry edge : getVertexValue().getEdgeMap(et).entrySet()) { EDGETYPE flipDir = et.flipNeighbor(); getVertexValue().getEdgeMap(flipDir).put(edge.getKey(), edge.getValue()); diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/testhelper/BubbleAddVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/testhelper/BubbleAddVertex.java index 8abb8c30a..2e3221e51 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/testhelper/BubbleAddVertex.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/testhelper/BubbleAddVertex.java @@ -123,7 +123,7 @@ public void addEdgeToInsertedBubble(EDGETYPE meToNewBubbleDir, VKmer insertedBub } public void setupEdgeForInsertedBubble() { - for (EDGETYPE et : EDGETYPE.values()) { + for (EDGETYPE et : EDGETYPE.values) { edges[et.get()] = new EdgeMap(); } edges[newBubbleToMajorEdgetype.get()].put(majorVertexId, new ReadIdSet(Arrays.asList(new Long(readId)))); From cbfaed089960cb6659065d3eadeec6d790579ce8 Mon Sep 17 00:00:00 2001 From: Jake Biesinger Date: Wed, 20 Nov 2013 16:32:02 -0800 Subject: [PATCH 02/12] BUILD-profile optimize: only compile regex once --- .../hyracks/graph/dataflow/ReadsKeyValueParserFactory.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java index 5f9ef4da1..6ba14a2e2 100644 --- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java +++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java @@ -54,6 +54,8 @@ public class ReadsKeyValueParserFactory implements IKeyValueParserFactory Date: Wed, 20 Nov 2013 16:33:00 -0800 Subject: [PATCH 03/12] BUILD-profile optimize: copy reference rather than deep copy in build --- .../hyracks/graph/dataflow/ReadsKeyValueParserFactory.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java index 6ba14a2e2..6f04dce6c 100644 --- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java +++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java @@ -164,7 +164,7 @@ private void SplitReads(long readID, byte[] readLetters, IFrameWriter writer) { DIR nextNodeDir = DIR.FORWARD; /* middle kmer */ - nextNode.reset(); + nextNode = new Node(); nextNode.setAverageCoverage(1); nextForwardKmer.setAsCopy(curForwardKmer); for (int i = Kmer.getKmerLength(); i < readLetters.length; i++) { @@ -177,9 +177,9 @@ private void SplitReads(long readID, byte[] readLetters, IFrameWriter writer) { curForwardKmer.setAsCopy(nextForwardKmer); curReverseKmer.setAsCopy(nextReverseKmer); - curNode.setAsCopy(nextNode); + curNode = nextNode; curNodeDir = nextNodeDir; - nextNode.reset(); + nextNode = new Node(); nextNode.setAverageCoverage(1); } From d7025f37262f48dd4835af41ee0219d71a53cea5 Mon Sep 17 00:00:00 2001 From: Jake Biesinger Date: Thu, 21 Nov 2013 12:52:06 -0800 Subject: [PATCH 04/12] reformat --- .../edu/uci/ics/genomix/type/EDGETYPE.java | 21 +++++++++---------- .../uci/ics/genomix/type/ReadHeadInfo.java | 7 ++++--- .../edu/uci/ics/genomix/type/ReadHeadSet.java | 7 ++++--- .../dataflow/ReadsKeyValueParserFactory.java | 6 +++--- .../pathmerge/BasicPathMergeVertex.java | 1 - .../pathmerge/P4ForPathMergeVertex.java | 17 +++++++-------- 6 files changed, 29 insertions(+), 30 deletions(-) diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EDGETYPE.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EDGETYPE.java index 076553e99..2033f6115 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EDGETYPE.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EDGETYPE.java @@ -3,11 +3,10 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; -import java.util.EnumSet; import org.apache.hadoop.io.Writable; -public enum EDGETYPE implements Writable{ +public enum EDGETYPE implements Writable { FF((byte) (0b00)), FR((byte) (0b01)), @@ -26,9 +25,9 @@ public final byte get() { return val; } - public static final EDGETYPE[] values = {FF, FR, RF, RR}; - public static final EDGETYPE[] INCOMING = {RF, RR}; - public static final EDGETYPE[] OUTGOING = {FF, FR}; + public static final EDGETYPE[] values = { FF, FR, RF, RR }; + public static final EDGETYPE[] INCOMING = { RF, RR }; + public static final EDGETYPE[] OUTGOING = { FF, FR }; public static EDGETYPE fromByte(short b) { b &= MASK; @@ -65,14 +64,14 @@ public static EDGETYPE mirror(EDGETYPE edgeType) { throw new RuntimeException("Unrecognized direction in mirrorDirection: " + edgeType); } } - + /** * */ - public static EDGETYPE getEdgeTypeFromDirToDir(DIR dir1, DIR dir2){ - switch(dir1){ + public static EDGETYPE getEdgeTypeFromDirToDir(DIR dir1, DIR dir2) { + switch (dir1) { case FORWARD: - switch(dir2){ + switch (dir2) { case FORWARD: return FF; case REVERSE: @@ -81,7 +80,7 @@ public static EDGETYPE getEdgeTypeFromDirToDir(DIR dir1, DIR dir2){ throw new IllegalArgumentException("Invalid direction2 given: " + dir2); } case REVERSE: - switch(dir2){ + switch (dir2) { case FORWARD: return RF; case REVERSE: @@ -93,7 +92,7 @@ public static EDGETYPE getEdgeTypeFromDirToDir(DIR dir1, DIR dir2){ throw new IllegalArgumentException("Invalid direction1 given: " + dir2); } } - + public DIR dir() { return dir(this); } diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadHeadInfo.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadHeadInfo.java index fc8bca24b..77f36c5bb 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadHeadInfo.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadHeadInfo.java @@ -16,7 +16,7 @@ public class ReadHeadInfo implements WritableComparable, Serializa private static final int bitsForPosition = 16; private static final int readIdShift = bitsForPosition + bitsForMate; private static final int positionIdShift = bitsForMate; - + private long value; public ReadHeadInfo(byte mateId, long readId, int offset) { @@ -93,12 +93,13 @@ public String toString() { return this.getReadId() + "-" + this.getOffset() + "_" + (this.getMateId()); } - /** sort by readId, then mateId, then offset + /** + * sort by readId, then mateId, then offset */ @Override public int compareTo(ReadHeadInfo o) { if (this.getReadId() == o.getReadId()) { - if(this.getMateId() == o.getMateId()){ + if (this.getMateId() == o.getMateId()) { return this.getOffset() - o.getOffset(); } return this.getMateId() - o.getMateId(); diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadHeadSet.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadHeadSet.java index 82aa89b32..62ba67e5a 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadHeadSet.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadHeadSet.java @@ -46,11 +46,12 @@ public ReadHeadInfo getReadHeadInfoFromReadId(long readId) { } public int getOffsetFromReadId(long readId) { - for(ReadHeadInfo readHeadInfo : this){ - if(readHeadInfo.getReadId() == readId) + for (ReadHeadInfo readHeadInfo : this) { + if (readHeadInfo.getReadId() == readId) return readHeadInfo.getOffset(); } - throw new IllegalArgumentException("The input parameter readId " + readId + " should exist in this ReadHeadSet, but not here!"); + throw new IllegalArgumentException("The input parameter readId " + readId + + " should exist in this ReadHeadSet, but not here!"); } public void setAsCopy(byte[] data, int offset) { diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java index 6f04dce6c..8a56059d3 100644 --- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java +++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java @@ -54,11 +54,11 @@ public class ReadsKeyValueParserFactory implements IKeyValueParserFactory msgIterator) { short state = vertex.getState(); boolean updated = false; EDGETYPE senderEdgetype; -// int numMerged = 0; + // int numMerged = 0; while (msgIterator.hasNext()) { PathMergeMessage incomingMsg = msgIterator.next(); if (verbose) { @@ -208,7 +207,7 @@ public void receiveMerges(Iterator msgIterator) { senderEdgetype = EDGETYPE.fromByte(incomingMsg.getFlag()); node.mergeWithNode(senderEdgetype, incomingMsg.getNode()); state |= (byte) (incomingMsg.getFlag() & DIR.MASK); // update incoming restricted directions -// numMerged++; + // numMerged++; updated = true; if (verbose) { LOG.fine("after merge: " + getVertexValue() + " restrictions: " + DIR.enumSetFromByte(state)); From ea862b4d4c801f3207961c78c46f09867b45470f Mon Sep 17 00:00:00 2001 From: Jake Biesinger Date: Thu, 21 Nov 2013 12:54:23 -0800 Subject: [PATCH 05/12] BUILD-profile optimize: remove getLengths() and return offsets from copy ops --- .../edu/uci/ics/genomix/type/EdgeMap.java | 43 +++++++------ .../java/edu/uci/ics/genomix/type/Node.java | 61 +++++-------------- .../edu/uci/ics/genomix/type/ReadHeadSet.java | 7 +-- .../edu/uci/ics/genomix/type/ReadIdSet.java | 7 +-- .../java/edu/uci/ics/genomix/type/VKmer.java | 9 +-- .../edu/uci/ics/genomix/type/EdgeMapTest.java | 2 +- .../edu/uci/ics/genomix/type/NodeTest.java | 8 +-- .../uci/ics/genomix/type/ReadIdSetTest.java | 5 +- .../AggregateKmerAggregateFactory.java | 7 ++- .../dataflow/AssembleKeyIntoNodeOperator.java | 3 +- .../dataflow/ReadsKeyValueParserFactory.java | 3 +- 11 files changed, 63 insertions(+), 92 deletions(-) diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EdgeMap.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EdgeMap.java index 56f133781..6da3eb4c7 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EdgeMap.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EdgeMap.java @@ -33,6 +33,7 @@ public class EdgeMap extends TreeMap implements Writable, Seri private static final long serialVersionUID = 1L; private static final int SIZE_INT = 4; + private static final int INITIAL_BYTE_ARRAY_SIZE = 30; public static boolean logReadIds; // FIXME regression in usage of this (I broke it) public EdgeMap() { @@ -67,58 +68,62 @@ public void setAsCopy(EdgeMap other) { } } - public int getLengthInBytes() { - int total = SIZE_INT; - for (Entry e : entrySet()) { - total += e.getKey().getLength() + e.getValue().getLengthInBytes(); - } - return total; - } - /** * Return this Edge's representation as a new byte array */ public byte[] marshalToByteArray() throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(getLengthInBytes()); + ByteArrayOutputStream baos = new ByteArrayOutputStream(INITIAL_BYTE_ARRAY_SIZE); DataOutputStream out = new DataOutputStream(baos); write(out); return baos.toByteArray(); } - public void setAsCopy(byte[] data, int offset) { + /** + * Populates this map from the given byte array + * + * @param data + * @param offset + * @return the offset pointing just beyond this EdgeMap in the byte array, that is, `offset + len(this)` + */ + public int setAsCopy(byte[] data, int offset) { int curOffset = offset; int count = Marshal.getInt(data, offset); curOffset += SIZE_INT; clear(); for (int i = 0; i < count; i++) { VKmer kmer = new VKmer(); - kmer.setAsCopy(data, curOffset); - curOffset += kmer.getLength(); + curOffset = kmer.setAsCopy(data, curOffset); ReadIdSet ids = new ReadIdSet(); - ids.setAsCopy(data, curOffset); - curOffset += ids.getLengthInBytes(); + curOffset = ids.setAsCopy(data, curOffset); put(kmer, ids); } + return curOffset; } - public void setAsReference(byte[] data, int offset) { + /** + * Populates this map from the given byte array, keeping references where possible + * + * @param data + * @param offset + * @return the offset pointing just beyond this EdgeMap in the byte array, that is, `offset + len(this)` + */ + public int setAsReference(byte[] data, int offset) { int curOffset = offset; int count = Marshal.getInt(data, offset); curOffset += SIZE_INT; clear(); for (int i = 0; i < count; i++) { VKmer kmer = new VKmer(); - kmer.setAsReference(data, curOffset); - curOffset += kmer.getLength(); + curOffset = kmer.setAsReference(data, curOffset); ReadIdSet ids = new ReadIdSet(); - ids.setAsCopy(data, curOffset); - curOffset += ids.getLengthInBytes(); + curOffset = ids.setAsCopy(data, curOffset); put(kmer, ids); } + return curOffset; } @Override diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java index b42f9debe..47c30ebee 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java @@ -15,17 +15,14 @@ package edu.uci.ics.genomix.type; -import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInput; -import java.io.DataInputStream; import java.io.DataOutput; import java.io.DataOutputStream; import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.Comparator; -import java.util.EnumSet; import java.util.Iterator; import java.util.List; import java.util.Map.Entry; @@ -136,6 +133,7 @@ public void remove() { } private static final long serialVersionUID = 1L; + private static final int INITIAL_BYTE_ARRAY_SIZE = 150; private EdgeMap[] edges; private ReadHeadSet unflippedReadIds; // first Kmer in read @@ -338,97 +336,66 @@ public void setFlippedReadIds(ReadHeadSet flippedReadIds) { } } - /** - * Returns the length of the byte-array version of this node - */ - public int getSerializedLength() { - int length = Byte.SIZE / 8; // byte header - for (EDGETYPE e : EDGETYPE.values) { - if (edges[e.get()] != null && edges[e.get()].size() > 0) { - length += edges[e.get()].getLengthInBytes(); - } - } - if (unflippedReadIds != null && unflippedReadIds.size() > 0) { - length += unflippedReadIds.getLengthInBytes(); - } - if (flippedReadIds != null && flippedReadIds.size() > 0) { - length += flippedReadIds.getLengthInBytes(); - } - if (internalKmer != null && internalKmer.getKmerLetterLength() > 0) { - length += internalKmer.getLength(); - } - if (averageCoverage != null) { - length += Float.SIZE / 8; - } - return length; - } - /** * Return this Node's representation as a new byte array */ public byte[] marshalToByteArray() throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(getSerializedLength()); + ByteArrayOutputStream baos = new ByteArrayOutputStream(INITIAL_BYTE_ARRAY_SIZE); DataOutputStream out = new DataOutputStream(baos); write(out); return baos.toByteArray(); } - public void setAsCopy(byte[] data, int offset) { + public int setAsCopy(byte[] data, int offset) { reset(); byte activeFields = data[offset]; offset += 1; for (EDGETYPE et : EDGETYPE.values) { // et.get() is the index of the bit; if non-zero, we this edge is present in the stream if ((activeFields & (1 << et.get())) != 0) { - getEdgeMap(et).setAsCopy(data, offset); - offset += edges[et.get()].getLengthInBytes(); + offset = getEdgeMap(et).setAsCopy(data, offset); } } if ((activeFields & NODE_FIELDS.UNFLIPPED_READ_IDS) != 0) { - getUnflippedReadIds().setAsCopy(data, offset); - offset += unflippedReadIds.getLengthInBytes(); + offset = getUnflippedReadIds().setAsCopy(data, offset); } if ((activeFields & NODE_FIELDS.FLIPPED_READ_IDS) != 0) { - getFlippedReadIds().setAsCopy(data, offset); - offset += flippedReadIds.getLengthInBytes(); + offset = getFlippedReadIds().setAsCopy(data, offset); } if ((activeFields & NODE_FIELDS.INTERNAL_KMER) != 0) { - getInternalKmer().setAsCopy(data, offset); - offset += internalKmer.getLength(); + offset = getInternalKmer().setAsCopy(data, offset); } if ((activeFields & NODE_FIELDS.AVERAGE_COVERAGE) != 0) { averageCoverage = Marshal.getFloat(data, offset); offset += Float.SIZE / 8; } + return offset; } - public void setAsReference(byte[] data, int offset) { + public int setAsReference(byte[] data, int offset) { reset(); byte activeFields = data[offset]; offset += 1; for (EDGETYPE et : EDGETYPE.values) { // et.get() is the index of the bit; if non-zero, we this edge is present in the stream if ((activeFields & (1 << et.get())) != 0) { - getEdgeMap(et).setAsReference(data, offset); - offset += edges[et.get()].getLengthInBytes(); + offset = getEdgeMap(et).setAsReference(data, offset); } } if ((activeFields & NODE_FIELDS.UNFLIPPED_READ_IDS) != 0) { - getUnflippedReadIds().setAsCopy(data, offset); - offset += unflippedReadIds.getLengthInBytes(); + offset = getUnflippedReadIds().setAsCopy(data, offset); } if ((activeFields & NODE_FIELDS.FLIPPED_READ_IDS) != 0) { - getFlippedReadIds().setAsCopy(data, offset); - offset += flippedReadIds.getLengthInBytes(); + offset = getFlippedReadIds().setAsCopy(data, offset); } if ((activeFields & NODE_FIELDS.INTERNAL_KMER) != 0) { - getInternalKmer().setAsReference(data, offset); - offset += internalKmer.getLength(); + offset = getInternalKmer().setAsReference(data, offset); } if ((activeFields & NODE_FIELDS.AVERAGE_COVERAGE) != 0) { averageCoverage = Marshal.getFloat(data, offset); offset += Float.SIZE / 8; } + return offset; } public static void write(Node n, DataOutput out) throws IOException { diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadHeadSet.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadHeadSet.java index 62ba67e5a..6f3c4fddc 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadHeadSet.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadHeadSet.java @@ -54,7 +54,7 @@ public int getOffsetFromReadId(long readId) { + " should exist in this ReadHeadSet, but not here!"); } - public void setAsCopy(byte[] data, int offset) { + public int setAsCopy(byte[] data, int offset) { clear(); int count = Marshal.getInt(data, offset); offset += HEADER_SIZE; @@ -62,6 +62,7 @@ public void setAsCopy(byte[] data, int offset) { add(new ReadHeadInfo(Marshal.getLong(data, offset))); offset += ReadHeadInfo.ITEM_SIZE; } + return offset; } @Override @@ -111,8 +112,4 @@ public static ReadHeadSet getIntersection(ReadHeadSet list1, ReadHeadSet list2) intersection.retainAll(list2); return intersection; } - - public int getLengthInBytes() { - return HEADER_SIZE + ReadHeadInfo.ITEM_SIZE * size(); - } } diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadIdSet.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadIdSet.java index c98807c28..4134d0ff5 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadIdSet.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/ReadIdSet.java @@ -64,11 +64,7 @@ public void readFields(DataInput in) throws IOException { } } - public int getLengthInBytes() { - return HEADER_SIZE + ITEM_SIZE * size(); - } - - public void setAsCopy(byte[] data, int offset) { + public int setAsCopy(byte[] data, int offset) { clear(); int count = Marshal.getInt(data, offset); offset += HEADER_SIZE; @@ -76,5 +72,6 @@ public void setAsCopy(byte[] data, int offset) { add(Marshal.getLong(data, offset)); offset += ITEM_SIZE; } + return offset; } } diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmer.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmer.java index caaa0e0b4..b07e229fe 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmer.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmer.java @@ -25,7 +25,6 @@ import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; -import edu.uci.ics.genomix.type.EDGETYPE; import edu.uci.ics.genomix.util.KmerUtil; import edu.uci.ics.genomix.util.Marshal; @@ -148,10 +147,11 @@ public void setAsCopy(String kmer) { * : byte array to copy (should have a header) * @param offset */ - public void setAsCopy(byte[] newData, int offset) { + public int setAsCopy(byte[] newData, int offset) { int k = Marshal.getInt(newData, offset); reset(k); System.arraycopy(newData, offset + HEADER_SIZE, bytes, this.kmerStartOffset, bytesUsed); + return offset + HEADER_SIZE + bytesUsed; } /** @@ -162,7 +162,7 @@ public void setAsCopy(byte[] newData, int offset) { * : byte array to copy (should have a header) * @param blockOffset */ - public void setAsReference(byte[] newData, int blockOffset) { + public int setAsReference(byte[] newData, int blockOffset) { bytes = newData; kmerStartOffset = blockOffset + HEADER_SIZE; int kRequested = Marshal.getInt(newData, blockOffset); @@ -173,6 +173,7 @@ public void setAsReference(byte[] newData, int blockOffset) { } storageMaxSize = bytesRequested; // since we are a reference, store our max capacity setKmerLength(kRequested); + return blockOffset + bytesRequested; } /** @@ -538,7 +539,7 @@ public void mergeWithFRKmer(int initialKmerSize, VKmer kmer) { revcomp.setReversedFromStringBytes(kmer.getKmerLetterLength(), kmer.toString().getBytes(), 0); mergeWithFFKmer(initialKmerSize, revcomp); } - + /** * Merge Kmer with the next connected Kmer, when that Kmer needs to be * reverse-complemented e.g. AAGCTAA merge with GGTTGTT, if the initial diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/EdgeMapTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/EdgeMapTest.java index 752f5e691..b8fa3bbcb 100644 --- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/EdgeMapTest.java +++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/EdgeMapTest.java @@ -151,7 +151,7 @@ public void TestByteStreamReadWrite() throws IOException { sampleList.add(sample); EdgeMap toTest = new EdgeMap(sampleList); //begin test - ByteArrayOutputStream baos = new ByteArrayOutputStream(toTest.getLengthInBytes()); + ByteArrayOutputStream baos = new ByteArrayOutputStream(toTest.marshalToByteArray().length); DataOutputStream out = new DataOutputStream(baos); toTest.write(out); InputStream inputStream = new ByteArrayInputStream(baos.toByteArray()); diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/NodeTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/NodeTest.java index 79f545860..86c07a4f5 100644 --- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/NodeTest.java +++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/NodeTest.java @@ -201,8 +201,8 @@ public void testDIR() throws IOException { edgeExample2.add(EDGETYPE.RR); Assert.assertEquals(edgeExample2, edgeTypes2); - Assert.assertEquals(edgeExample1, DIR.edgeTypesInDir(testDir1)); - Assert.assertEquals(edgeExample2, DIR.edgeTypesInDir(testDir2)); + Assert.assertEquals(edgeExample1, EnumSet.copyOf(Arrays.asList(DIR.edgeTypesInDir(testDir1)))); + Assert.assertEquals(edgeExample2, EnumSet.copyOf(Arrays.asList(DIR.edgeTypesInDir(testDir2)))); EnumSet dirExample = EnumSet.noneOf(DIR.class); dirExample.add(DIR.FORWARD); @@ -386,7 +386,7 @@ public void testSetCopyAndRefWithByteArray() throws IOException { for (int i = 10; i < 15; i++) { NodeTest.assembleNodeRandomly(dataNodes[i - 10], i); - nodeOffset[i - 10] = dataNodes[i - 10].getSerializedLength(); + nodeOffset[i - 10] = dataNodes[i - 10].marshalToByteArray().length; outputStream.write(dataNodes[i - 10].marshalToByteArray()); } byte[] dataArray = outputStream.toByteArray(); @@ -513,7 +513,7 @@ public void testSeartReadsAndEndReads() { public void testWriteAndReadFields() throws IOException { Node srcNode = new Node(); NodeTest.assembleNodeRandomly(srcNode, 17); - ByteArrayOutputStream baos = new ByteArrayOutputStream(srcNode.getSerializedLength()); + ByteArrayOutputStream baos = new ByteArrayOutputStream(srcNode.marshalToByteArray().length); DataOutputStream out = new DataOutputStream(baos); srcNode.write(out); ByteArrayInputStream inputStream = new ByteArrayInputStream(baos.toByteArray()); diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/ReadIdSetTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/ReadIdSetTest.java index 0e71b88c3..ff10094fd 100644 --- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/ReadIdSetTest.java +++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/type/ReadIdSetTest.java @@ -13,12 +13,13 @@ public class ReadIdSetTest { /** * basic check for adding and reading operation related to ReadIdSet + * @throws IOException */ @Test - public void Test1() { + public void Test1() throws IOException { EdgeMap elist = new EdgeMap(); Assert.assertEquals(0, elist.size()); - Assert.assertEquals(4, elist.getLengthInBytes()); + Assert.assertEquals(4, elist.marshalToByteArray().length); VKmer kmer1 = new VKmer("ACCGCTTAGATACC"); ReadIdSet plist1 = new ReadIdSet(); diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/AggregateKmerAggregateFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/AggregateKmerAggregateFactory.java index e5ebf2dbe..af57a5947 100644 --- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/AggregateKmerAggregateFactory.java +++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/AggregateKmerAggregateFactory.java @@ -184,11 +184,12 @@ public boolean outputFinalResult(ArrayTupleBuilder tupleBuilder, IFrameTupleAcce // } // } try { - fieldOutput.write(localUniNode.marshalToByteArray(), 0, localUniNode.getSerializedLength()); + byte[] uniNodeBytes = localUniNode.marshalToByteArray(); + fieldOutput.write(uniNodeBytes, 0, uniNodeBytes.length); tupleBuilder.addFieldEndOffset(); - if (localUniNode.getSerializedLength() > frameSize / 2) { + if (uniNodeBytes.length > frameSize / 2) { LOG.warning("Aggregate Kmer: output data kmerByteSize is too big: " - + localUniNode.getSerializedLength() + "\nNode is:" + localUniNode.toString()); + + uniNodeBytes.length + "\nNode is:" + localUniNode.toString()); } } catch (IOException e) { throw new HyracksDataException("I/O exception when writing aggregation to the output buffer."); diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/AssembleKeyIntoNodeOperator.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/AssembleKeyIntoNodeOperator.java index c9f6500d7..811c7ebac 100644 --- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/AssembleKeyIntoNodeOperator.java +++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/AssembleKeyIntoNodeOperator.java @@ -107,7 +107,8 @@ private void outputNode(Node node) throws HyracksDataException { try { builder.reset(); - builder.addField(node.marshalToByteArray(), 0, node.getSerializedLength()); + byte[] nodeBytes = node.marshalToByteArray(); + builder.addField(nodeBytes, 0, nodeBytes.length); if (!appender.append(builder.getFieldEndOffsets(), builder.getByteArray(), 0, builder.getSize())) { FrameUtils.flushFrame(writeBuffer, writer); diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java index 8a56059d3..0160781c9 100644 --- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java +++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/dataflow/ReadsKeyValueParserFactory.java @@ -234,7 +234,8 @@ private void InsertToFrame(Kmer kmer, Node node, IFrameWriter writer) { try { tupleBuilder.reset(); tupleBuilder.addField(kmer.getBytes(), kmer.getOffset(), kmer.getLength()); - tupleBuilder.addField(node.marshalToByteArray(), 0, node.getSerializedLength()); + byte[] nodeBytes = node.marshalToByteArray(); + tupleBuilder.addField(nodeBytes, 0, nodeBytes.length); if (!outputAppender.append(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray(), 0, tupleBuilder.getSize())) { From b6e4742fa23a87a37c03a1ddcb00f93f0936609b Mon Sep 17 00:00:00 2001 From: Jake Biesinger Date: Thu, 21 Nov 2013 17:51:55 -0800 Subject: [PATCH 06/12] MERGE-profile optimize: in/out formats use tmp references --- .../java/edu/uci/ics/genomix/type/Node.java | 21 +++++- .../java/edu/uci/ics/genomix/type/VKmer.java | 14 ++++ .../GenericVertexToNodeOutputFormat.java | 64 ++++++++++--------- .../NodeToGenericVertexInputFormat.java | 5 +- .../format/NodeToVertexInputFormat.java | 6 +- 5 files changed, 74 insertions(+), 36 deletions(-) diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java index 47c30ebee..6ba797026 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java @@ -35,8 +35,6 @@ public class Node implements Writable, Serializable { public static final Logger LOG = Logger.getLogger(Node.class.getName()); - protected static boolean DEBUG = true; - public static List problemKmers = new ArrayList(); public enum READHEAD_ORIENTATION { UNFLIPPED((byte) 0), @@ -397,6 +395,25 @@ public int setAsReference(byte[] data, int offset) { } return offset; } + + /** + * Make a shallow copy of node. + * + * WARNING: future changes in `node`'s averageCoverage won't be reflected in `this.averageCoverage` + * @param node + */ + public void setAsReference(Node node) { + setAsReference(node.edges, node.unflippedReadIds, node.flippedReadIds, node.internalKmer, node.averageCoverage); + } + + public void setAsReference(EdgeMap[] edges, ReadHeadSet unflippedReadIds, ReadHeadSet flippedReadIds, VKmer kmer, + Float coverage) { + this.edges = edges; + this.unflippedReadIds = unflippedReadIds; + this.flippedReadIds = flippedReadIds; + this.internalKmer = kmer; + this.averageCoverage = coverage; + } public static void write(Node n, DataOutput out) throws IOException { out.writeByte(n.getActiveFields()); diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmer.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmer.java index b07e229fe..547f0b052 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmer.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmer.java @@ -175,6 +175,20 @@ public int setAsReference(byte[] newData, int blockOffset) { setKmerLength(kRequested); return blockOffset + bytesRequested; } + + /** + * Shallow copy of the given kmer (s.t. we are backed by the same bytes) + * + * WARNING: Changes in the kmerLength after using setAsReference may not always + * be reflected in either `other` or `this`! + */ + public void setAsReference(VKmer other) { + this.bytes = other.bytes; + this.bytesUsed = other.bytesUsed; + this.kmerStartOffset = other.kmerStartOffset; + this.lettersInKmer = other.lettersInKmer; + this.storageMaxSize = other.storageMaxSize; + } /** * Reset array by kmerlength diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/GenericVertexToNodeOutputFormat.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/GenericVertexToNodeOutputFormat.java index f32c82252..5a28a8261 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/GenericVertexToNodeOutputFormat.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/GenericVertexToNodeOutputFormat.java @@ -7,39 +7,45 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext; import edu.uci.ics.genomix.pregelix.api.io.binary.BinaryVertexOutputFormat; +import edu.uci.ics.genomix.type.Kmer; import edu.uci.ics.genomix.type.Node; import edu.uci.ics.genomix.type.VKmer; import edu.uci.ics.pregelix.api.graph.Vertex; import edu.uci.ics.pregelix.api.io.VertexWriter; public abstract class GenericVertexToNodeOutputFormat extends - BinaryVertexOutputFormat { - - @Override - public VertexWriter createVertexWriter( - TaskAttemptContext context) throws IOException, - InterruptedException { - @SuppressWarnings("unchecked") - RecordWriter recordWriter = binaryOutputFormat - .getRecordWriter(context); - return new BinaryLoadGraphVertexWriter(recordWriter); - } - - /** - * Simple VertexWriter that supports {@link BinaryLoadGraphVertex} - */ - public static class BinaryLoadGraphVertexWriter extends - BinaryVertexWriter { - public BinaryLoadGraphVertexWriter( - RecordWriter lineRecordWriter) { - super(lineRecordWriter); - } - - @Override - public void writeVertex(Vertex vertex) - throws IOException, InterruptedException { - getRecordWriter().write(vertex.getVertexId(), - vertex.getVertexValue().getCopyAsNode()); - } - } + BinaryVertexOutputFormat { + + @Override + public VertexWriter createVertexWriter(TaskAttemptContext context) throws IOException, + InterruptedException { + @SuppressWarnings("unchecked") + RecordWriter recordWriter = binaryOutputFormat.getRecordWriter(context); + return new BinaryLoadGraphVertexWriter(recordWriter); + } + + /** + * Simple VertexWriter that supports {@link BinaryLoadGraphVertex} + */ + public static class BinaryLoadGraphVertexWriter extends BinaryVertexWriter { + private Node node; + + public BinaryLoadGraphVertexWriter(RecordWriter lineRecordWriter) { + super(lineRecordWriter); + node = new Node(); + } + + @Override + public void writeVertex(Vertex vertex) throws IOException, InterruptedException { + // keep only the values relevant to NodeToGenericVertexInputFormat + node.setAsReference(vertex.getVertexValue()); + + // HACK + // if the kmer hasn't changed in length, it's equivalent to the vertexId and we don't need to write it + if (node.getInternalKmer().getKmerLetterLength() == Kmer.getKmerLength()) { + node.setInternalKmer(null); + } + getRecordWriter().write(vertex.getVertexId(), node); + } + } } diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/NodeToGenericVertexInputFormat.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/NodeToGenericVertexInputFormat.java index 7be5af684..b3401b276 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/NodeToGenericVertexInputFormat.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/NodeToGenericVertexInputFormat.java @@ -45,12 +45,13 @@ public Vertex getCurrentVertex() throws /** * set the src vertex id */ - vertexId.setAsCopy(getRecordReader().getCurrentKey()); + // okay to use setAsReference here since the caller will immediately write the values + vertexId.setAsReference(getRecordReader().getCurrentKey()); vertex.setVertexId(vertexId); /** * set the vertex value */ - vertexValue.setAsCopy(getRecordReader().getCurrentValue()); + vertexValue.setAsReference(getRecordReader().getCurrentValue()); if (vertexValue.getInternalKmer().getKmerLetterLength() == 0) // initial input directly from graph building vertexValue.setInternalKmer(vertexId); vertex.setVertexValue(vertexValue); diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/NodeToVertexInputFormat.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/NodeToVertexInputFormat.java index 2e9143e57..dd5140fc0 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/NodeToVertexInputFormat.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/NodeToVertexInputFormat.java @@ -18,13 +18,13 @@ public class NodeToVertexInputFormat extends NodeToGenericVertexInputFormat createVertexReader(InputSplit split, TaskAttemptContext context) throws IOException { - return new BinaryDataCleanLoadGraphReaderScaffolding(binaryInputFormat.createRecordReader(split, context)); + return new BinaryDataCleanLoadGraphReaderVertexValue(binaryInputFormat.createRecordReader(split, context)); } - private class BinaryDataCleanLoadGraphReaderScaffolding extends + private class BinaryDataCleanLoadGraphReaderVertexValue extends NodeToGenericVertexInputFormat.BinaryDataCleanLoadGraphReader { - public BinaryDataCleanLoadGraphReaderScaffolding(RecordReader recordReader) { + public BinaryDataCleanLoadGraphReaderVertexValue(RecordReader recordReader) { super(recordReader); vertexValue = new VertexValueWritable(); } From f09b83a12f4e8205b3f9d91eb6ba6d0959a35ec8 Mon Sep 17 00:00:00 2001 From: Jake Biesinger Date: Thu, 21 Nov 2013 17:53:41 -0800 Subject: [PATCH 07/12] MERGE-profile optimize: less reading from conf; use global debug vals --- .../ics/genomix/config/GenomixJobConf.java | 19 +++++++-- .../pregelix/io/VertexValueWritable.java | 9 ++-- .../operator/DeBruijnGraphCleanVertex.java | 41 +++++++------------ .../pathmerge/P4ForPathMergeVertex.java | 5 ++- 4 files changed, 39 insertions(+), 35 deletions(-) diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/config/GenomixJobConf.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/config/GenomixJobConf.java index 692e8fdc8..59d2e7891 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/config/GenomixJobConf.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/config/GenomixJobConf.java @@ -18,6 +18,7 @@ import java.util.ArrayList; import java.util.EnumSet; import java.util.HashMap; +import java.util.List; import java.util.Map; import org.apache.commons.lang3.StringUtils; @@ -30,9 +31,17 @@ import edu.uci.ics.genomix.minicluster.GenerateGraphViz.GRAPH_TYPE; import edu.uci.ics.genomix.type.EdgeMap; import edu.uci.ics.genomix.type.Kmer; +import edu.uci.ics.genomix.type.Node; +import edu.uci.ics.genomix.type.VKmer; @SuppressWarnings("deprecation") public class GenomixJobConf extends JobConf { + + public static boolean debug = false; + public static ArrayList debugKmers; + + private static Map tickTimes = new HashMap(); + /* The following section ties together command-line options with a global JobConf * Each variable has an annotated, command-line Option which is private here but * is accessible through JobConf.get(GenomixConfigOld.VARIABLE). @@ -277,8 +286,6 @@ public static void verifyPatterns(Patterns[] patterns) { public static final String STATS_MIN_CONTIGLENGTH = "genomix.conf.minContigLength"; // intermediate date evaluation - private static Map tickTimes = new HashMap(); - public GenomixJobConf(int kmerLength) { super(new Configuration()); setInt(KMER_LENGTH, kmerLength); @@ -512,7 +519,13 @@ public static long tock(String counter) { public static void setGlobalStaticConstants(Configuration conf) { Kmer.setGlobalKmerLength(Integer.parseInt(conf.get(GenomixJobConf.KMER_LENGTH))); // EdgeWritable.MAX_READ_IDS_PER_EDGE = Integer.parseInt(conf.get(GenomixJobConf.MAX_READIDS_PER_EDGE)); - EdgeMap.logReadIds = Boolean.parseBoolean(conf.get(GenomixJobConf.LOG_READIDS)); + debug = conf.get(GenomixJobConf.DEBUG_KMERS) != null; + debugKmers = new ArrayList(); + if (conf.get(GenomixJobConf.DEBUG_KMERS) != null) { + for (String kmer : conf.get(GenomixJobConf.DEBUG_KMERS).split(",")) { + debugKmers.add(new VKmer(kmer)); + } + } } } diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/VertexValueWritable.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/VertexValueWritable.java index a472f5abd..a3be07309 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/VertexValueWritable.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/VertexValueWritable.java @@ -4,6 +4,7 @@ import java.io.DataOutput; import java.io.IOException; +import edu.uci.ics.genomix.config.GenomixJobConf; import edu.uci.ics.genomix.pregelix.io.common.ByteWritable; import edu.uci.ics.genomix.pregelix.io.common.HashMapWritable; import edu.uci.ics.genomix.pregelix.io.common.VLongWritable; @@ -46,6 +47,8 @@ public static class State extends VertexStateFlag { private short state; private boolean isFakeVertex; private HashMapWritable counters; + + protected boolean verbose = false; public VertexValueWritable() { super(); @@ -156,9 +159,9 @@ public void readFields(DataInput in) throws IOException { // this.counters.readFields(in); // scaffoldingMap.readFields(in); - if (DEBUG) { - boolean verbose = false; - for (VKmer problemKmer : problemKmers) { + if (GenomixJobConf.debug) { + verbose = false; + for (VKmer problemKmer : GenomixJobConf.debugKmers) { verbose |= this.getInternalKmer().equals(problemKmer); verbose |= findEdge(problemKmer) != null; } diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/DeBruijnGraphCleanVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/DeBruijnGraphCleanVertex.java index dcd918ba1..f82677ce5 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/DeBruijnGraphCleanVertex.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/DeBruijnGraphCleanVertex.java @@ -45,7 +45,7 @@ public abstract class DeBruijnGraphCleanVertexB--et2-->C with et1 being the first dimension and et2 being // the second - public EDGETYPE[][] validPathsTable = new EDGETYPE[][] { { EDGETYPE.RF, EDGETYPE.FF }, { EDGETYPE.RF, EDGETYPE.FR }, + public static final EDGETYPE[][] validPathsTable = new EDGETYPE[][] { { EDGETYPE.RF, EDGETYPE.FF }, { EDGETYPE.RF, EDGETYPE.FR }, { EDGETYPE.RR, EDGETYPE.FF }, { EDGETYPE.RR, EDGETYPE.FR } }; protected M outgoingMsg = null; @@ -55,8 +55,6 @@ public abstract class DeBruijnGraphCleanVertex problemKmers = null; - protected boolean debug = false; protected boolean verbose = false; protected boolean logReadIds = false; @@ -66,33 +64,22 @@ public abstract class DeBruijnGraphCleanVertex(); - if (getContext().getConfiguration().get(GenomixJobConf.DEBUG_KMERS) != null) { - for (String kmer : getContext().getConfiguration().get(GenomixJobConf.DEBUG_KMERS).split(",")) { - problemKmers.add(new VKmer(kmer)); - } - Node.problemKmers = problemKmers; + if (getSuperstep() == 1) { + if (kmerSize == -1) { + kmerSize = Integer.parseInt(getContext().getConfiguration().get(GenomixJobConf.KMER_LENGTH)); } + if (maxIteration < 0) { + maxIteration = Integer.parseInt(getContext().getConfiguration().get( + GenomixJobConf.GRAPH_CLEAN_MAX_ITERATIONS)); + } + GenomixJobConf.setGlobalStaticConstants(getContext().getConfiguration()); } - + verbose = false; - for (VKmer problemKmer : problemKmers) { - verbose |= debug - && (getVertexValue().findEdge(problemKmer) != null || getVertexId().equals(problemKmer)); + if (GenomixJobConf.debug) { + for (VKmer debugKmer : GenomixJobConf.debugKmers) { + verbose |= (getVertexValue().findEdge(debugKmer) != null || getVertexId().equals(debugKmer)); + } } } diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java index 96f910ceb..9bb9106f2 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java @@ -25,7 +25,7 @@ public class P4ForPathMergeVertex extends BasicPathMergeVertex Date: Fri, 22 Nov 2013 12:46:25 -0800 Subject: [PATCH 08/12] MERGE-profile optimize: random uses hash, not many calls, to break correlation --- .../ics/genomix/util/HashedSeedRandom.java | 30 +++++++++++++++++++ .../pathmerge/P4ForPathMergeVertex.java | 15 +++++----- 2 files changed, 37 insertions(+), 8 deletions(-) create mode 100644 genomix/genomix-data/src/main/java/edu/uci/ics/genomix/util/HashedSeedRandom.java diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/util/HashedSeedRandom.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/util/HashedSeedRandom.java new file mode 100644 index 000000000..72750d8f6 --- /dev/null +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/util/HashedSeedRandom.java @@ -0,0 +1,30 @@ +package edu.uci.ics.genomix.util; + +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.Random; + +/** + * A random number generator that hashes its seeds to break any correlation in similar seeds + */ +@SuppressWarnings("serial") +public class HashedSeedRandom extends Random { + MessageDigest md; + + @Override + public void setSeed(long seed) { + if (md == null) { + try { + md = MessageDigest.getInstance("SHA1"); + } catch (NoSuchAlgorithmException e) { + throw new RuntimeException(e); + } + } else { + md.reset(); + } + for (int i = 0; i < Long.SIZE; i += Byte.SIZE) { + md.update((byte) (seed >>> i)); + } + super.setSeed(Marshal.getLong(md.digest(), 0)); + } +} diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java index 9bb9106f2..d2565a3af 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java @@ -16,7 +16,7 @@ import edu.uci.ics.genomix.type.EDGETYPE; import edu.uci.ics.genomix.type.Node; import edu.uci.ics.genomix.type.VKmer; -import edu.uci.ics.hyracks.api.exceptions.HyracksDataException; +import edu.uci.ics.genomix.util.HashedSeedRandom; /** * Graph clean pattern: P4(Smart-algorithm) for path merge @@ -43,6 +43,7 @@ public class P4ForPathMergeVertex extends BasicPathMergeVertex msgIterator) { } @Override - public void compute(Iterator msgIterator) throws HyracksDataException { + public void compute(Iterator msgIterator) { initVertex(); if (verbose) LOG.fine("Iteration " + getSuperstep() + " for key " + getVertexId()); From 878a6b0234af008f3e068f8d096eb7af2af14ff0 Mon Sep 17 00:00:00 2001 From: Jake Biesinger Date: Sun, 24 Nov 2013 00:07:36 -0800 Subject: [PATCH 09/12] MERGE-profile optimize: don't send unnecessary edges in MERGE --- .../java/edu/uci/ics/genomix/type/EDGETYPE.java | 17 +++++++++++++++++ .../java/edu/uci/ics/genomix/type/Node.java | 1 - .../pathmerge/BasicPathMergeVertex.java | 13 +++++++++++-- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EDGETYPE.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EDGETYPE.java index 2033f6115..6505fa983 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EDGETYPE.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/EDGETYPE.java @@ -109,6 +109,23 @@ public static DIR dir(EDGETYPE edgeType) { // .dir static / non-static throw new RuntimeException("Unrecognized direction in dirFromEdgeType: " + edgeType); } } + + public DIR neighborDir() { + return neighborDir(this); + } + + public static DIR neighborDir(EDGETYPE et) { + switch (et) { + case FF: + case RF: + return DIR.FORWARD; + case FR: + case RR: + return DIR.REVERSE; + default: + throw new RuntimeException("Unrecognized direction in dirFromEdgeType: " + et); + } + } /** * return the edgetype corresponding to moving across edge1 and edge2. diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java index 6ba797026..063ceacfa 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java @@ -254,7 +254,6 @@ public void setEdgeMap(EDGETYPE edgeType, EdgeMap edgeMap) { if (edgeMap == null) { edges[edgeType.get()] = null; } else { - getEdgeMap(edgeType).clear(); getEdgeMap(edgeType).setAsCopy(edgeMap); } } diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java index a60b2dcdf..b842f3f33 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java @@ -17,6 +17,7 @@ import edu.uci.ics.genomix.type.DIR; import edu.uci.ics.genomix.type.EDGETYPE; import edu.uci.ics.genomix.type.EdgeMap; +import edu.uci.ics.genomix.type.Node; import edu.uci.ics.genomix.type.VKmer; /** @@ -183,8 +184,16 @@ public void sendMergeMsg() { .enumSetFromByte(state)); outgoingMsg.setFlag((short) (mergeEdgetype.mirror().get() | neighborRestrictions)); - outgoingMsg.setSourceVertexId(getVertexId()); - outgoingMsg.setNode(vertex); // TODO reduce amount sent in this Node (only internalKmer and 1/2 of edges) + Node outNode = outgoingMsg.getNode(); + // set only relevant edges + for (EDGETYPE et : mergeEdgetype.mirror().neighborDir().edgeTypes()) { + outNode.setEdgeMap(et, vertex.getEdgeMap(et)); + } + outNode.setUnflippedReadIds(vertex.getUnflippedReadIds()); + outNode.setFlippedReadIds(vertex.getFlippedReadIds()); + outNode.setAverageCoverage(vertex.getAverageCoverage()); + outNode.getInternalKmer().setAsCopy(vertex.getInternalKmer()); + if (vertex.degree(mergeEdgetype.dir()) != 1) throw new IllegalStateException("Merge attempted in node with degree in " + mergeEdgetype + " direction != 1!\n" + vertex); From 56e3b9b1f36a21320227b8e960fd887f9c9dca5b Mon Sep 17 00:00:00 2001 From: Jake Biesinger Date: Sun, 24 Nov 2013 00:09:11 -0800 Subject: [PATCH 10/12] MERGE-profile optimize: only send non-overlapping parts of kmers --- .../operator/pathmerge/BasicPathMergeVertex.java | 9 ++++++++- .../operator/pathmerge/P4ForPathMergeVertex.java | 4 +++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java index b842f3f33..f2e28f33f 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java @@ -17,6 +17,7 @@ import edu.uci.ics.genomix.type.DIR; import edu.uci.ics.genomix.type.EDGETYPE; import edu.uci.ics.genomix.type.EdgeMap; +import edu.uci.ics.genomix.type.Kmer; import edu.uci.ics.genomix.type.Node; import edu.uci.ics.genomix.type.VKmer; @@ -192,7 +193,13 @@ public void sendMergeMsg() { outNode.setUnflippedReadIds(vertex.getUnflippedReadIds()); outNode.setFlippedReadIds(vertex.getFlippedReadIds()); outNode.setAverageCoverage(vertex.getAverageCoverage()); - outNode.getInternalKmer().setAsCopy(vertex.getInternalKmer()); + // only send non-overlapping letters // TODO do something more efficient than toString? + if (mergeEdgetype.mirror().neighborDir() == DIR.FORWARD) { + outNode.getInternalKmer().setAsCopy(vertex.getInternalKmer().toString().substring(Kmer.getKmerLength() - 1)); + } else { + outNode.getInternalKmer().setAsCopy(vertex.getInternalKmer().toString().substring(0, vertex.getInternalKmer().getKmerLetterLength() - Kmer.getKmerLength() + 1)); + } + if (vertex.degree(mergeEdgetype.dir()) != 1) throw new IllegalStateException("Merge attempted in node with degree in " + mergeEdgetype diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java index d2565a3af..e342d593c 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java @@ -205,7 +205,9 @@ public void receiveMerges(Iterator msgIterator) { LOG.fine("before merge: " + getVertexValue() + " restrictions: " + DIR.enumSetFromByte(state)); } senderEdgetype = EDGETYPE.fromByte(incomingMsg.getFlag()); - node.mergeWithNode(senderEdgetype, incomingMsg.getNode()); + node.mergeWithNodeWithoutKmer(senderEdgetype, incomingMsg.getNode()); + // only the non-overlapping portions of the kmer are sent + node.getInternalKmer().mergeWithKmerInDir(senderEdgetype, 1, incomingMsg.getNode().getInternalKmer()); state |= (byte) (incomingMsg.getFlag() & DIR.MASK); // update incoming restricted directions // numMerged++; updated = true; From 3c0a13046269de8c5925b90b3cc3f08631e394ec Mon Sep 17 00:00:00 2001 From: Jake Biesinger Date: Mon, 25 Nov 2013 15:04:25 -0800 Subject: [PATCH 11/12] fix coverage bug form short kmers in MERGE messages --- .../src/main/java/edu/uci/ics/genomix/type/Node.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java index 063ceacfa..fc32bf67a 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java @@ -281,14 +281,17 @@ public void setAverageCoverage(float averageCoverage) { * paths. */ public void mergeCoverage(Node other) { + mergeCoverage(other, other.internalKmer.getKmerLetterLength()); + } + + public void mergeCoverage(Node other, int otherOriginalLength) { // sequence considered in the average doesn't include anything // overlapping with other kmers - float adjustedLength = internalKmer.getKmerLetterLength() + other.internalKmer.getKmerLetterLength() - - (Kmer.getKmerLength() - 1) * 2; + float adjustedLength = internalKmer.getKmerLetterLength() + otherOriginalLength - (Kmer.getKmerLength() - 1) + * 2; float myCount = (internalKmer.getKmerLetterLength() - Kmer.getKmerLength() + 1) * averageCoverage; - float otherCount = (other.internalKmer.getKmerLetterLength() - Kmer.getKmerLength() + 1) - * other.averageCoverage; + float otherCount = (otherOriginalLength - Kmer.getKmerLength() + 1) * other.averageCoverage; averageCoverage = (myCount + otherCount) / adjustedLength; } From f3fb2613df7ec0a11aa0ab5ad39efd2a31367278 Mon Sep 17 00:00:00 2001 From: Jake Biesinger Date: Mon, 2 Dec 2013 15:23:16 -0800 Subject: [PATCH 12/12] fix bug in coverage; simplify merging with truncated kmers in messages --- .../main/java/edu/uci/ics/genomix/type/Node.java | 9 +++++++++ .../operator/pathmerge/P4ForPathMergeVertex.java | 14 +++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java index fc32bf67a..eab8b6ae8 100644 --- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java +++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Node.java @@ -575,6 +575,15 @@ public void mergeWithNodeWithoutKmer(EDGETYPE edgeType, final Node other) { mergeUnflippedAndFlippedReadIDs(edgeType, other); mergeCoverage(other); } + + public void mergeWithNodeUsingTruncatedKmer(EDGETYPE edgeType, Node other) { + mergeEdges(edgeType, other); + mergeUnflippedAndFlippedReadIDs(edgeType, other); + + // only the non-overlapping portions of the kmer were sent-- coverage and kmer merge handled differently as a result + mergeCoverage(other, other.internalKmer.getKmerLetterLength() + Kmer.getKmerLength() - 1); + getInternalKmer().mergeWithKmerInDir(edgeType, 1, other.getInternalKmer()); + } /** * merge all metadata from `other` into this, as if `other` were the same node as this. diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java index e342d593c..eaf2707d2 100644 --- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java +++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java @@ -205,9 +205,7 @@ public void receiveMerges(Iterator msgIterator) { LOG.fine("before merge: " + getVertexValue() + " restrictions: " + DIR.enumSetFromByte(state)); } senderEdgetype = EDGETYPE.fromByte(incomingMsg.getFlag()); - node.mergeWithNodeWithoutKmer(senderEdgetype, incomingMsg.getNode()); - // only the non-overlapping portions of the kmer are sent - node.getInternalKmer().mergeWithKmerInDir(senderEdgetype, 1, incomingMsg.getNode().getInternalKmer()); + node.mergeWithNodeUsingTruncatedKmer(senderEdgetype, incomingMsg.getNode()); state |= (byte) (incomingMsg.getFlag() & DIR.MASK); // update incoming restricted directions // numMerged++; updated = true; @@ -239,6 +237,12 @@ public void receiveMerges(Iterator msgIterator) { @Override public void compute(Iterator msgIterator) { initVertex(); + if (Float.isInfinite(getVertexValue().getAverageCoverage()) || Float.isNaN(getVertexValue().getAverageCoverage())) { + System.out.println("Before: " + getVertexValue()); + } + if (getVertexId().toString().equals("AGCGCAAGG")) { + System.out.println(); + } if (verbose) LOG.fine("Iteration " + getSuperstep() + " for key " + getVertexId()); if (getSuperstep() > maxIteration) { // TODO should we make sure the graph is complete or allow interruptions that will cause an asymmetric graph? @@ -260,6 +264,10 @@ public void compute(Iterator msgIterator) { receiveUpdates(msgIterator); sendMergeMsg(); } + if (Float.isInfinite(getVertexValue().getAverageCoverage()) || Float.isNaN(getVertexValue().getAverageCoverage())) { + System.out.println("after: " + getVertexValue()); + throw new RuntimeException(this.toString()); + } } public static void main(String[] args) throws Exception {