5
0
mirror of https://github.com/apache/sqoop.git synced 2025-05-10 22:13:07 +08:00

SQOOP-1825: Sqoop2: Handle NULLs for all types in CSV Intermediate Data Format

(Veena Basavaraj via Abraham Elmahrek)
This commit is contained in:
Abraham Elmahrek 2014-12-12 11:51:00 -06:00
parent 546f861527
commit 1fc6589221
2 changed files with 151 additions and 40 deletions

View File

@ -67,7 +67,7 @@ public class CSVIntermediateDataFormat extends IntermediateDataFormat<String> {
public static final char ESCAPE_CHARACTER = '\\';
public static final char QUOTE_CHARACTER = '\'';
public static final String NULL_STRING = "NULL";
public static final String NULL_VALUE = "NULL";
private static final char[] originals = {
0x5C,0x00,0x0A,0x0D,0x1A,0x22,0x27
@ -249,7 +249,7 @@ public Object[] getObjectData() {
Column[] columnArray = schema.getColumns().toArray(new Column[fieldStringArray.length]);
for (int i = 0; i < fieldStringArray.length; i++) {
// check for NULL field and bail out immediately
if (fieldStringArray[i].equals("NULL")) {
if (fieldStringArray[i].equals(NULL_VALUE)) {
objectArray[i] = null;
continue;
}
@ -410,8 +410,17 @@ else if (value instanceof JSONObject) {
*/
@Override
public void setObjectData(Object[] data) {
  // Positions of null elements; encodeCSVStringElements skips these indices so
  // it never dereferences or casts a null value.
  Set<Integer> nullValueIndices = new HashSet<Integer>();
  Column[] columnArray = schema.getColumns().toArray(new Column[data.length]);
  // Replace every null with the NULL_VALUE marker before any encoding happens.
  // Note: the caller's array is modified in place.
  for (int i = 0; i < data.length; i++) {
    if (data[i] == null) {
      nullValueIndices.add(i);
      data[i] = NULL_VALUE;
    }
  }
  // ignore the null values while encoding the object array into csv string
  encodeCSVStringElements(data, columnArray, nullValueIndices);
  this.data = StringUtils.join(data, SEPARATOR_CHARACTER);
}
@ -465,19 +474,25 @@ public int compareTo(IntermediateDataFormat<?> o) {
* @param objectArray
*/
@SuppressWarnings("unchecked")
private void encodeCSVStringElements(Object[] objectArray, Column[] columnArray) {
private void encodeCSVStringElements(Object[] objectArray, Column[] columnArray, Set<Integer> nullValueIndices) {
for (int i : bitTypeColumnIndices) {
if (!nullValueIndices.contains(i)) {
String bitStringValue = objectArray[i].toString();
if ((TRUE_BIT_SET.contains(bitStringValue)) || (FALSE_BIT_SET.contains(bitStringValue))) {
objectArray[i] = bitStringValue;
} else {
throw new SqoopException(CSVIntermediateDataFormatError.CSV_INTERMEDIATE_DATA_FORMAT_0009, " given bit value: " + objectArray[i]);
throw new SqoopException(CSVIntermediateDataFormatError.CSV_INTERMEDIATE_DATA_FORMAT_0009, " given bit value: "
+ objectArray[i]);
}
}
}
for (int i : stringTypeColumnIndices) {
if (!nullValueIndices.contains(i)) {
objectArray[i] = escapeString((String) objectArray[i]);
}
}
for (int i : dateTimeTypeColumnIndices) {
if (!nullValueIndices.contains(i)) {
Column col = columnArray[i];
if (objectArray[i] instanceof org.joda.time.DateTime) {
org.joda.time.DateTime dateTime = (org.joda.time.DateTime) objectArray[i];
@ -488,28 +503,39 @@ private void encodeCSVStringElements(Object[] objectArray, Column[] columnArray)
formatLocalDateTime(objectArray, i, col, localDateTime);
}
}
}
for (int i : dateTypeColumnIndices) {
if (!nullValueIndices.contains(i)) {
org.joda.time.LocalDate date = (org.joda.time.LocalDate) objectArray[i];
objectArray[i] = encloseWithQuote(df.print(date));
}
}
for (int i : timeColumnIndices) {
Column col = columnArray[i];
if (!nullValueIndices.contains(i)) {
if (((org.apache.sqoop.schema.type.Time) col).hasFraction()) {
objectArray[i] = encloseWithQuote(tfWithFraction.print((org.joda.time.LocalTime) objectArray[i]));
} else {
objectArray[i] = encloseWithQuote(tfWithNoFraction.print((org.joda.time.LocalTime) objectArray[i]));
}
}
}
for (int i : byteTypeColumnIndices) {
if (!nullValueIndices.contains(i)) {
objectArray[i] = escapeByteArrays((byte[]) objectArray[i]);
}
}
for (int i : listTypeColumnIndices) {
if (!nullValueIndices.contains(i)) {
objectArray[i] = encodeList((Object[]) objectArray[i], columnArray[i]);
}
}
for (int i : mapTypeColumnIndices) {
if (!nullValueIndices.contains(i)) {
objectArray[i] = encodeMap((Map<Object, Object>) objectArray[i], columnArray[i]);
}
}
}
private void formatLocalDateTime(Object[] objectArray, int i, Column col, org.joda.time.LocalDateTime localDateTime) {
org.apache.sqoop.schema.type.DateTime column = (org.apache.sqoop.schema.type.DateTime) col;
@ -562,8 +588,7 @@ private boolean isColumnListType(Column listType) {
}
/**
 * Checks whether the given column is of a string-like type.
 *
 * @param stringType the column to inspect
 * @return true when the column's type is TEXT or ENUM, false otherwise
 */
private boolean isColumnStringType(Column stringType) {
  return stringType.getType().equals(ColumnType.TEXT) || stringType.getType().equals(ColumnType.ENUM);
}
private String escapeByteArrays(byte[] bytes) {
@ -586,10 +611,6 @@ private String getRegExp(String orig) {
}
private String escapeString(String orig) {
if (orig == null) {
return NULL_STRING;
}
int j = 0;
String replacement = orig;
try {

View File

@ -32,6 +32,7 @@
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apache.sqoop.common.SqoopException;
import org.apache.sqoop.schema.Schema;
import org.apache.sqoop.schema.type.Array;
@ -39,6 +40,7 @@
import org.apache.sqoop.schema.type.Bit;
import org.apache.sqoop.schema.type.Date;
import org.apache.sqoop.schema.type.DateTime;
import org.apache.sqoop.schema.type.Decimal;
import org.apache.sqoop.schema.type.FixedPoint;
import org.apache.sqoop.schema.type.Text;
import org.apache.sqoop.schema.type.Time;
@ -83,20 +85,108 @@ public void testNullInputAsCSVTextInObjectArrayOut() {
assertNull(out);
}
// Empty CSV text against a six-column schema: the test passes only if a
// SqoopException is raised.
@Test(expected = SqoopException.class)
public void testEmptyInputAsCSVTextInObjectArrayOut() {
  Schema schema = new Schema("test");
  schema.addColumn(new FixedPoint("1")).addColumn(new FixedPoint("2")).addColumn(new Text("3")).addColumn(new Text("4"))
      .addColumn(new Binary("5")).addColumn(new Text("6"));
  dataFormat.setSchema(schema);
  dataFormat.setCSVTextData("");
  dataFormat.getObjectData();
}
/**
 * Object array of nulls in, CSV text out: every field of the resulting CSV
 * text must be the NULL marker, for each column type in the schema.
 */
@Test
public void testNullValueAsObjectArrayInAndCSVTextOut() {
  Schema schema = new Schema("test");
  schema.addColumn(new FixedPoint("1")).addColumn(new Decimal("2")).addColumn(new Text("3"))
      .addColumn(new Array("4", new Text("t"))).addColumn(new Binary("5"))
      .addColumn(new org.apache.sqoop.schema.type.Map("6", new Text("t1"), new Text("t2"))).addColumn(new Bit("7"))
      .addColumn(new org.apache.sqoop.schema.type.DateTime("8", false, false))
      .addColumn(new org.apache.sqoop.schema.type.Time("9", false)).addColumn(new org.apache.sqoop.schema.type.Date("10"))
      .addColumn(new org.apache.sqoop.schema.type.FloatingPoint("11"))
      .addColumn(new org.apache.sqoop.schema.type.Set("12", new Text("t4")))
      .addColumn(new org.apache.sqoop.schema.type.Enum("13")).addColumn(new org.apache.sqoop.schema.type.Unknown("14"));
  dataFormat.setSchema(schema);
  Object[] in = { null, null, null, null, null, null, null, null, null, null, null, null, null, null };
  dataFormat.setObjectData(in);
  String csvText = dataFormat.getCSVTextData();
  String[] textValues = csvText.split(",");
  // One CSV field per input element; also keeps the loop below from passing vacuously.
  assertEquals(in.length, textValues.length);
  for (String text : textValues) {
    // JUnit order is (expected, actual) so failure messages read correctly.
    assertEquals(CSVIntermediateDataFormat.NULL_VALUE, text);
  }
}
/**
 * Object array of nulls in, object array out: every element read back must
 * be null, for each column type in the schema.
 */
@Test
public void testNullValueAsObjectArrayInAndObjectArrayOut() {
  Schema schema = new Schema("test");
  schema.addColumn(new FixedPoint("1")).addColumn(new Decimal("2")).addColumn(new Text("3"))
      .addColumn(new Array("4", new Text("t"))).addColumn(new Binary("5"))
      .addColumn(new org.apache.sqoop.schema.type.Map("6", new Text("t1"), new Text("t2"))).addColumn(new Bit("7"))
      .addColumn(new org.apache.sqoop.schema.type.DateTime("8", false, false))
      .addColumn(new org.apache.sqoop.schema.type.Time("9", false)).addColumn(new org.apache.sqoop.schema.type.Date("10"))
      .addColumn(new org.apache.sqoop.schema.type.FloatingPoint("11"))
      .addColumn(new org.apache.sqoop.schema.type.Set("12", new Text("t4")))
      .addColumn(new org.apache.sqoop.schema.type.Enum("13")).addColumn(new org.apache.sqoop.schema.type.Unknown("14"));
  dataFormat.setSchema(schema);
  Object[] in = { null, null, null, null, null, null, null, null, null, null, null, null, null, null };
  dataFormat.setObjectData(in);
  Object[] out = dataFormat.getObjectData();
  // One output element per input element; also keeps the loop below from passing vacuously.
  assertEquals(in.length, out.length);
  for (Object obj : out) {
    assertNull(obj);
  }
}
/**
 * CSV text of NULL markers in, object array out: every element read back
 * must be null, for each column type in the schema.
 */
@Test
public void testNullValueAsCSVTextInAndObjectArrayOut() {
  Schema schema = new Schema("test");
  schema.addColumn(new FixedPoint("1")).addColumn(new Decimal("2")).addColumn(new Text("3"))
      .addColumn(new Array("4", new Text("t"))).addColumn(new Binary("5"))
      .addColumn(new org.apache.sqoop.schema.type.Map("6", new Text("t1"), new Text("t2"))).addColumn(new Bit("7"))
      .addColumn(new org.apache.sqoop.schema.type.DateTime("8", false, false))
      .addColumn(new org.apache.sqoop.schema.type.Time("9", false)).addColumn(new org.apache.sqoop.schema.type.Date("10"))
      .addColumn(new org.apache.sqoop.schema.type.FloatingPoint("11"))
      .addColumn(new org.apache.sqoop.schema.type.Set("12", new Text("t4")))
      .addColumn(new org.apache.sqoop.schema.type.Enum("13")).addColumn(new org.apache.sqoop.schema.type.Unknown("14"));
  dataFormat.setSchema(schema);
  String[] test = { "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL",
      "NULL" };
  dataFormat.setCSVTextData(StringUtils.join(test, ","));
  Object[] out = dataFormat.getObjectData();
  // One output element per CSV field; also keeps the loop below from passing vacuously.
  assertEquals(test.length, out.length);
  for (Object obj : out) {
    assertNull(obj);
  }
}
/**
 * CSV text of NULL markers in, CSV text out: every field read back must
 * still be the NULL marker, for each column type in the schema.
 */
@Test
public void testNullValueAsCSVTextInAndCSVTextOut() {
  Schema schema = new Schema("test");
  schema.addColumn(new FixedPoint("1")).addColumn(new Decimal("2")).addColumn(new Text("3"))
      .addColumn(new Array("4", new Text("t"))).addColumn(new Binary("5"))
      .addColumn(new org.apache.sqoop.schema.type.Map("6", new Text("t1"), new Text("t2"))).addColumn(new Bit("7"))
      .addColumn(new org.apache.sqoop.schema.type.DateTime("8", false, false))
      .addColumn(new org.apache.sqoop.schema.type.Time("9", false)).addColumn(new org.apache.sqoop.schema.type.Date("10"))
      .addColumn(new org.apache.sqoop.schema.type.FloatingPoint("11"))
      .addColumn(new org.apache.sqoop.schema.type.Set("12", new Text("t4")))
      .addColumn(new org.apache.sqoop.schema.type.Enum("13")).addColumn(new org.apache.sqoop.schema.type.Unknown("14"));
  dataFormat.setSchema(schema);
  String[] test = { "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL",
      "NULL" };
  dataFormat.setCSVTextData(StringUtils.join(test, ","));
  String csvText = dataFormat.getCSVTextData();
  String[] textValues = csvText.split(",");
  // One field out per field in; also keeps the loop below from passing vacuously.
  assertEquals(test.length, textValues.length);
  for (String text : textValues) {
    // JUnit order is (expected, actual) so failure messages read correctly.
    assertEquals(CSVIntermediateDataFormat.NULL_VALUE, text);
  }
}
//**************test cases for primitive types( text, number, bytearray)*******************
@Test