Skip to content

Commit

Permalink
ARROW-1943: [JAVA] handle setInitialCapacity for deeply nested lists
Browse files Browse the repository at this point in the history
The current implementation of setInitialCapacity() uses a factor of 5 for every level we go into list:

So if the schema is LIST (LIST (LIST (LIST (LIST (LIST (LIST (BIGINT)))))) and we start with an initial capacity
of 128, we end up throwing OversizedAllocationException from the BigIntVector because at every level we increased
the capacity by 5 and by the time we reached inner scalar that actually stores the data, we were well over max size limit per vector (1MB).

We saw this problem downstream when we failed to read deeply nested JSON data.

The potential fix is to use the factor of 5 only when we are down to the leaf vector. As the depth increases and we are still working with complex/list, we don't use the factor of 5.

cc @jacques-n , @BryanCutler , @icexelloss

Author: siddharth <[email protected]>

Closes apache#1439 from siddharthteotia/ARROW-1943 and squashes the following commits:

d0adbad [siddharth] unit tests
e2f21a8 [siddharth] fix imports
d103436 [siddharth] ARROW-1943: handle setInitialCapacity for deeply nested lists
  • Loading branch information
siddharthteotia committed Dec 22, 2017
1 parent 41b11bf commit 9d84f84
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
import org.apache.arrow.vector.UInt4Vector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.ZeroVector;
import org.apache.arrow.vector.BaseNullableVariableWidthVector;
import org.apache.arrow.vector.BaseNullableFixedWidthVector;
import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.CallBack;
Expand Down Expand Up @@ -134,7 +136,11 @@ public FieldVector getDataVector() {
@Override
public void setInitialCapacity(int numRecords) {
offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH;
vector.setInitialCapacity(numRecords * RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD);
if (vector instanceof BaseNullableFixedWidthVector || vector instanceof BaseNullableVariableWidthVector) {
vector.setInitialCapacity(numRecords * RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD);
} else {
vector.setInitialCapacity(numRecords);
}
}

@Override
Expand Down
128 changes: 128 additions & 0 deletions java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,134 @@ public void testNestedListVector() throws Exception {
}
}

@Test
public void testNestedListVector1() throws Exception {
try (ListVector listVector = ListVector.empty("sourceVector", allocator)) {

MinorType listType = MinorType.LIST;
MinorType scalarType = MinorType.BIGINT;

listVector.addOrGetVector(FieldType.nullable(listType.getType()));

ListVector innerList1 = (ListVector)listVector.getDataVector();
innerList1.addOrGetVector(FieldType.nullable(listType.getType()));

ListVector innerList2 = (ListVector)innerList1.getDataVector();
innerList2.addOrGetVector(FieldType.nullable(listType.getType()));

ListVector innerList3 = (ListVector)innerList2.getDataVector();
innerList3.addOrGetVector(FieldType.nullable(listType.getType()));

ListVector innerList4 = (ListVector)innerList3.getDataVector();
innerList4.addOrGetVector(FieldType.nullable(listType.getType()));

ListVector innerList5 = (ListVector)innerList4.getDataVector();
innerList5.addOrGetVector(FieldType.nullable(listType.getType()));

ListVector innerList6 = (ListVector)innerList5.getDataVector();
innerList6.addOrGetVector(FieldType.nullable(scalarType.getType()));

listVector.setInitialCapacity(128);
}
}

@Test
public void testNestedListVector2() throws Exception {
try (ListVector listVector = ListVector.empty("sourceVector", allocator)) {
listVector.setInitialCapacity(1);
UnionListWriter listWriter = listVector.getWriter();
/* allocate memory */
listWriter.allocate();

/* write one or more inner lists at index 0 */
listWriter.setPosition(0);
listWriter.startList();

listWriter.list().startList();
listWriter.list().bigInt().writeBigInt(50);
listWriter.list().bigInt().writeBigInt(100);
listWriter.list().bigInt().writeBigInt(200);
listWriter.list().endList();

listWriter.list().startList();
listWriter.list().bigInt().writeBigInt(75);
listWriter.list().bigInt().writeBigInt(125);
listWriter.list().endList();

listWriter.endList();

/* write one or more inner lists at index 1 */
listWriter.setPosition(1);
listWriter.startList();

listWriter.list().startList();
listWriter.list().bigInt().writeBigInt(15);
listWriter.list().bigInt().writeBigInt(20);
listWriter.list().endList();

listWriter.list().startList();
listWriter.list().bigInt().writeBigInt(25);
listWriter.list().bigInt().writeBigInt(30);
listWriter.list().bigInt().writeBigInt(35);
listWriter.list().endList();

listWriter.endList();

assertEquals(2, listVector.getLastSet());

listVector.setValueCount(2);

assertEquals(2, listVector.getValueCount());

/* get listVector value at index 0 -- the value itself is a listvector */
Object result = listVector.getObject(0);
ArrayList<ArrayList<Long>> resultSet = (ArrayList<ArrayList<Long>>) result;
ArrayList<Long> list;

assertEquals(2, resultSet.size()); /* 2 inner lists at index 0 */
assertEquals(3, resultSet.get(0).size()); /* size of first inner list */
assertEquals(2, resultSet.get(1).size()); /* size of second inner list */

list = resultSet.get(0);
assertEquals(new Long(50), list.get(0));
assertEquals(new Long(100), list.get(1));
assertEquals(new Long(200), list.get(2));

list = resultSet.get(1);
assertEquals(new Long(75), list.get(0));
assertEquals(new Long(125), list.get(1));

/* get listVector value at index 1 -- the value itself is a listvector */
result = listVector.getObject(1);
resultSet = (ArrayList<ArrayList<Long>>) result;

assertEquals(2, resultSet.size()); /* 3 inner lists at index 1 */
assertEquals(2, resultSet.get(0).size()); /* size of first inner list */
assertEquals(3, resultSet.get(1).size()); /* size of second inner list */

list = resultSet.get(0);
assertEquals(new Long(15), list.get(0));
assertEquals(new Long(20), list.get(1));

list = resultSet.get(1);
assertEquals(new Long(25), list.get(0));
assertEquals(new Long(30), list.get(1));
assertEquals(new Long(35), list.get(2));

/* check underlying bitVector */
assertFalse(listVector.isNull(0));
assertFalse(listVector.isNull(1));

/* check underlying offsets */
final ArrowBuf offsetBuffer = listVector.getOffsetBuffer();

/* listVector has 2 lists at index 0 and 3 lists at index 1 */
assertEquals(0, offsetBuffer.getInt(0 * ListVector.OFFSET_WIDTH));
assertEquals(2, offsetBuffer.getInt(1 * ListVector.OFFSET_WIDTH));
assertEquals(4, offsetBuffer.getInt(2 * ListVector.OFFSET_WIDTH));
}
}

@Test
public void testGetBufferAddress() throws Exception {
try (ListVector listVector = ListVector.empty("vector", allocator)) {
Expand Down

0 comments on commit 9d84f84

Please sign in to comment.