From 9d84f84543cf51d362293a7bd11094ebc8b8ee68 Mon Sep 17 00:00:00 2001 From: siddharth Date: Fri, 22 Dec 2017 11:24:43 -0800 Subject: [PATCH] ARROW-1943: [JAVA] handle setInitialCapacity for deeply nested lists The current implementation of setInitialCapacity() uses a factor of 5 for every level we go into list: So if the schema is LIST (LIST (LIST (LIST (LIST (LIST (LIST (BIGINT)))))) and we start with an initial capacity of 128, we end up throwing OversizedAllocationException from the BigIntVector because at every level we increased the capacity by 5 and by the time we reached inner scalar that actually stores the data, we were well over max size limit per vector (1MB). We saw this problem downstream when we failed to read deeply nested JSON data. The potential fix is to use the factor of 5 only when we are down to the leaf vector. As the depth increases and we are still working with complex/list, we don't use the factor of 5. cc @jacques-n , @BryanCutler , @icexelloss Author: siddharth Closes #1439 from siddharthteotia/ARROW-1943 and squashes the following commits: d0adbade [siddharth] unit tests e2f21a8b [siddharth] fix imports d103436b [siddharth] ARROW-1943: handle setInitialCapacity for deeply nested lists --- .../complex/BaseRepeatedValueVector.java | 8 +- .../apache/arrow/vector/TestListVector.java | 128 ++++++++++++++++++ 2 files changed, 135 insertions(+), 1 deletion(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java index 4648d078949a7..a909828602aff 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -29,6 +29,8 @@ import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.BaseNullableVariableWidthVector; +import org.apache.arrow.vector.BaseNullableFixedWidthVector; import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.CallBack; @@ -134,7 +136,11 @@ public FieldVector getDataVector() { @Override public void setInitialCapacity(int numRecords) { offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH; - vector.setInitialCapacity(numRecords * RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD); + if (vector instanceof BaseNullableFixedWidthVector || vector instanceof BaseNullableVariableWidthVector) { + vector.setInitialCapacity(numRecords * RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD); + } else { + vector.setInitialCapacity(numRecords); + } } @Override diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java index f6aa86a300876..ce9fb4442550a 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java @@ -559,6 +559,134 @@ public void testNestedListVector() throws Exception { } } + @Test + public void testNestedListVector1() throws Exception { + try (ListVector listVector = ListVector.empty("sourceVector", allocator)) { + + MinorType listType = MinorType.LIST; + MinorType scalarType = MinorType.BIGINT; + + listVector.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList1 = (ListVector)listVector.getDataVector(); + innerList1.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList2 = (ListVector)innerList1.getDataVector(); + innerList2.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList3 = (ListVector)innerList2.getDataVector(); + innerList3.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList4 = (ListVector)innerList3.getDataVector(); + innerList4.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList5 = (ListVector)innerList4.getDataVector(); + innerList5.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList6 = (ListVector)innerList5.getDataVector(); + innerList6.addOrGetVector(FieldType.nullable(scalarType.getType())); + + listVector.setInitialCapacity(128); + } + } + + @Test + public void testNestedListVector2() throws Exception { + try (ListVector listVector = ListVector.empty("sourceVector", allocator)) { + listVector.setInitialCapacity(1); + UnionListWriter listWriter = listVector.getWriter(); + /* allocate memory */ + listWriter.allocate(); + + /* write one or more inner lists at index 0 */ + listWriter.setPosition(0); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(50); + listWriter.list().bigInt().writeBigInt(100); + listWriter.list().bigInt().writeBigInt(200); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(75); + listWriter.list().bigInt().writeBigInt(125); + listWriter.list().endList(); + + listWriter.endList(); + + /* write one or more inner lists at index 1 */ + listWriter.setPosition(1); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(15); + listWriter.list().bigInt().writeBigInt(20); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(25); + listWriter.list().bigInt().writeBigInt(30); + listWriter.list().bigInt().writeBigInt(35); + listWriter.list().endList(); + + listWriter.endList(); + + assertEquals(2, listVector.getLastSet()); + + listVector.setValueCount(2); + + assertEquals(2, listVector.getValueCount()); + + /* get listVector value at index 0 -- the value itself is a listvector */ + Object result = listVector.getObject(0); + ArrayList> resultSet = (ArrayList>) result; + ArrayList list; + + assertEquals(2, resultSet.size()); /* 2 inner lists at index 0 */ + assertEquals(3, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(2, resultSet.get(1).size()); /* size of second inner list */ + + list = resultSet.get(0); + assertEquals(new Long(50), list.get(0)); + assertEquals(new Long(100), list.get(1)); + assertEquals(new Long(200), list.get(2)); + + list = resultSet.get(1); + assertEquals(new Long(75), list.get(0)); + assertEquals(new Long(125), list.get(1)); + + /* get listVector value at index 1 -- the value itself is a listvector */ + result = listVector.getObject(1); + resultSet = (ArrayList>) result; + + assertEquals(2, resultSet.size()); /* 3 inner lists at index 1 */ + assertEquals(2, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(3, resultSet.get(1).size()); /* size of second inner list */ + + list = resultSet.get(0); + assertEquals(new Long(15), list.get(0)); + assertEquals(new Long(20), list.get(1)); + + list = resultSet.get(1); + assertEquals(new Long(25), list.get(0)); + assertEquals(new Long(30), list.get(1)); + assertEquals(new Long(35), list.get(2)); + + /* check underlying bitVector */ + assertFalse(listVector.isNull(0)); + assertFalse(listVector.isNull(1)); + + /* check underlying offsets */ + final ArrowBuf offsetBuffer = listVector.getOffsetBuffer(); + + /* listVector has 2 lists at index 0 and 3 lists at index 1 */ + assertEquals(0, offsetBuffer.getInt(0 * ListVector.OFFSET_WIDTH)); + assertEquals(2, offsetBuffer.getInt(1 * ListVector.OFFSET_WIDTH)); + assertEquals(4, offsetBuffer.getInt(2 * ListVector.OFFSET_WIDTH)); + } + } + @Test public void testGetBufferAddress() throws Exception { try (ListVector listVector = ListVector.empty("vector", allocator)) {