
SQOOP-2849: Sqoop2: Job failure when writing parquet in hdfs with data coming from mysql

(Abraham Fine via Jarek Jarcec Cecho)
Committed by Jarek Jarcec Cecho on 2016-02-29 11:37:06 -08:00
parent f9d7c3a8e5
commit e06190b2f8
3 changed files with 45 additions and 20 deletions
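Context for the change: identifiers coming from MySQL (quoted or backquoted table and column names, or names beginning with a digit) can violate Avro's naming rules, so building the Avro schema for the Parquet write into HDFS fails. The patch therefore funnels every schema, field, and enum name through a new SqoopAvroUtils.createAvroName() helper before it is handed to Avro. The snippet below is a standalone sketch of that sanitization rule, not code from the patch: the class name and the first sample identifier are made up for illustration, while the second sample is the value used by the new test.

// Illustrative only: mirrors the regex-based rule added in SqoopAvroUtils.createAvroName().
public class AvroNameSanitizerDemo {

  // Avro requires names to start with [A-Za-z_] and to contain only [A-Za-z0-9_] afterwards.
  static String createAvroName(String name) {
    return name.replaceFirst("^[0-9]", "")        // drop a single leading digit
               .replaceAll("[^a-zA-Z0-9_]", "");  // strip every other character Avro rejects
  }

  public static void main(String[] args) {
    System.out.println(createAvroName("\"ORDER-DATE\""));          // prints ORDERDATE (hypothetical MySQL-style name)
    System.out.println(createAvroName("9`\" blah`^&*(^&*(%$^&"));  // prints blah (the name used in the new test)
  }
}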


@@ -18,6 +18,7 @@
 package org.apache.sqoop.connector.common;

 import org.apache.avro.Schema;
+import org.apache.log4j.Logger;
 import org.apache.sqoop.classification.InterfaceAudience;
 import org.apache.sqoop.classification.InterfaceStability;
 import org.apache.sqoop.common.SqoopException;
@@ -36,6 +37,8 @@
 @InterfaceStability.Unstable
 public class SqoopAvroUtils {

+  private static final Logger LOG = Logger.getLogger(SqoopAvroUtils.class);
+
   public static final String COLUMN_TYPE = "columnType";
   public static final String SQOOP_SCHEMA_NAMESPACE = "org.apache.sqoop";
@@ -44,14 +47,14 @@ public class SqoopAvroUtils {
    */
   public static Schema createAvroSchema(org.apache.sqoop.schema.Schema sqoopSchema) {
     // avro schema names cannot start with quotes, lets just remove them
-    String name = sqoopSchema.getName().replace("\"", "");
+    String name = createAvroName(sqoopSchema.getName());
     String doc = sqoopSchema.getNote();
     String namespace = SQOOP_SCHEMA_NAMESPACE;
     Schema schema = Schema.createRecord(name, doc, namespace, false);

     List<Schema.Field> fields = new ArrayList<Schema.Field>();
     for (Column column : sqoopSchema.getColumnsArray()) {
-      Schema.Field field = new Schema.Field(column.getName(), createAvroFieldSchema(column), null, null);
+      Schema.Field field = new Schema.Field(createAvroName(column.getName()), createAvroFieldSchema(column), null, null);
       field.addProp(COLUMN_TYPE, column.getType().toString());
       fields.add(field);
     }
@@ -59,6 +62,16 @@ public static Schema createAvroSchema(org.apache.sqoop.schema.Schema sqoopSchema
     return schema;
   }

+  // From the avro docs:
+  // The name portion of a fullname, record field names, and enum symbols must:
+  //   start with [A-Za-z_]
+  //   subsequently contain only [A-Za-z0-9_]
+  public static String createAvroName(String name) {
+    String avroName = name.replaceFirst("^[0-9]", "").replaceAll("[^a-zA-Z0-9_]", "");
+    LOG.debug("Replacing name: " + name + " with Avro name: " + avroName);
+    return avroName;
+  }
+
   public static Schema createAvroFieldSchema(Column column) {
     Schema schema = toAvroFieldType(column);
     if (!column.isNullable()) {
@@ -123,7 +136,7 @@ public static Schema createEnumSchema(Column column) {
     assert column instanceof org.apache.sqoop.schema.type.Enum;
     Set<String> options = ((org.apache.sqoop.schema.type.Enum) column).getOptions();
     List<String> listOptions = new ArrayList<String>(options);
-    return Schema.createEnum(column.getName(), null, SQOOP_SCHEMA_NAMESPACE, listOptions);
+    return Schema.createEnum(createAvroName(column.getName()), null, SQOOP_SCHEMA_NAMESPACE, listOptions);
   }

   public static byte[] getBytesFromByteBuffer(Object obj) {


@@ -36,6 +36,7 @@
 import org.apache.sqoop.classification.InterfaceAudience;
 import org.apache.sqoop.classification.InterfaceStability;
 import org.apache.sqoop.common.SqoopException;
+import org.apache.sqoop.connector.common.SqoopAvroUtils;
 import org.apache.sqoop.error.code.IntermediateDataFormatError;
 import org.apache.sqoop.schema.type.Column;
 import org.apache.sqoop.utils.ClassUtils;
@@ -166,11 +167,12 @@ public GenericRecord toAVRO(String csv) {
         throw new SqoopException(IntermediateDataFormatError.INTERMEDIATE_DATA_FORMAT_0005,
             columns[i].getName() + " does not support null values");
       }
+      String name = SqoopAvroUtils.createAvroName(columns[i].getName());
       if (csvStringArray[i].equals(DEFAULT_NULL_VALUE)) {
-        avroObject.put(columns[i].getName(), null);
+        avroObject.put(name, null);
         continue;
       }
-      avroObject.put(columns[i].getName(), toAVRO(csvStringArray[i], columns[i]));
+      avroObject.put(name, toAVRO(csvStringArray[i], columns[i]));
     }
     return avroObject;
   }
@@ -250,56 +252,59 @@ public GenericRecord toAVRO(Object[] objectArray) {
         throw new SqoopException(IntermediateDataFormatError.INTERMEDIATE_DATA_FORMAT_0005,
             columns[i].getName() + " does not support null values");
       }
+      String name = SqoopAvroUtils.createAvroName(columns[i].getName());
       if (objectArray[i] == null) {
-        avroObject.put(columns[i].getName(), null);
+        avroObject.put(name, null);
         continue;
       }
       switch (columns[i].getType()) {
       case ARRAY:
       case SET:
-        avroObject.put(columns[i].getName(), toList((Object[]) objectArray[i]));
+        avroObject.put(name, toList((Object[]) objectArray[i]));
         break;
       case ENUM:
         GenericData.EnumSymbol enumValue = new GenericData.EnumSymbol(createEnumSchema(columns[i]),
             (String) objectArray[i]);
-        avroObject.put(columns[i].getName(), enumValue);
+        avroObject.put(name, enumValue);
         break;
       case TEXT:
-        avroObject.put(columns[i].getName(), new Utf8((String) objectArray[i]));
+        avroObject.put(name, new Utf8((String) objectArray[i]));
         break;
       case BINARY:
       case UNKNOWN:
-        avroObject.put(columns[i].getName(), ByteBuffer.wrap((byte[]) objectArray[i]));
+        avroObject.put(name, ByteBuffer.wrap((byte[]) objectArray[i]));
         break;
       case MAP:
       case FIXED_POINT:
       case FLOATING_POINT:
-        avroObject.put(columns[i].getName(), objectArray[i]);
+        avroObject.put(name, objectArray[i]);
         break;
       case DECIMAL:
         // TODO: store as FIXED in SQOOP-16161
-        avroObject.put(columns[i].getName(), ((BigDecimal) objectArray[i]).toPlainString());
+        avroObject.put(name, ((BigDecimal) objectArray[i]).toPlainString());
         break;
       case DATE_TIME:
         if (objectArray[i] instanceof org.joda.time.DateTime) {
-          avroObject.put(columns[i].getName(), ((org.joda.time.DateTime) objectArray[i]).toDate()
+          avroObject.put(name, ((org.joda.time.DateTime) objectArray[i]).toDate()
              .getTime());
         } else if (objectArray[i] instanceof org.joda.time.LocalDateTime) {
-          avroObject.put(columns[i].getName(), ((org.joda.time.LocalDateTime) objectArray[i])
+          avroObject.put(name, ((org.joda.time.LocalDateTime) objectArray[i])
              .toDate().getTime());
         }
         break;
       case TIME:
-        avroObject.put(columns[i].getName(), ((org.joda.time.LocalTime) objectArray[i])
+        avroObject.put(name, ((org.joda.time.LocalTime) objectArray[i])
            .toDateTimeToday().getMillis());
         break;
       case DATE:
-        avroObject.put(columns[i].getName(), ((org.joda.time.LocalDate) objectArray[i]).toDate()
+        avroObject.put(name, ((org.joda.time.LocalDate) objectArray[i]).toDate()
            .getTime());
         break;
       case BIT:
-        avroObject.put(columns[i].getName(), Boolean.valueOf(objectArray[i].toString()));
+        avroObject.put(name, Boolean.valueOf(objectArray[i].toString()));
         break;
       default:
         throw new SqoopException(IntermediateDataFormatError.INTERMEDIATE_DATA_FORMAT_0001,
@@ -317,7 +322,7 @@ public String toCSV(GenericRecord record) {
     StringBuilder csvString = new StringBuilder();
     for (int i = 0; i < columns.length; i++) {
-      Object obj = record.get(columns[i].getName());
+      Object obj = record.get(SqoopAvroUtils.createAvroName(columns[i].getName()));
       if (obj == null && !columns[i].isNullable()) {
         throw new SqoopException(IntermediateDataFormatError.INTERMEDIATE_DATA_FORMAT_0005,
             columns[i].getName() + " does not support null values");
@@ -396,8 +401,8 @@ public Object[] toObject(GenericRecord record) {
     Object[] object = new Object[columns.length];
     for (int i = 0; i < columns.length; i++) {
-      Object obj = record.get(columns[i].getName());
-      Integer nameIndex = schema.getColumnNameIndex(columns[i].getName());
+      Object obj = record.get(SqoopAvroUtils.createAvroName(columns[i].getName()));
+      Integer nameIndex = schema.getColumnNameIndex(SqoopAvroUtils.createAvroName(columns[i].getName()));
       Column column = columns[nameIndex];
       // null is a possible value
       if (obj == null && !column.isNullable()) {


@@ -39,6 +39,7 @@
 import org.apache.sqoop.schema.type.FixedPoint;
 import org.apache.sqoop.schema.type.Text;
 import org.joda.time.LocalDateTime;
+import org.testng.Assert;
 import org.testng.annotations.BeforeMethod;
 import org.testng.annotations.Test;
@@ -545,4 +546,10 @@ public void testSchemaNotNullableWithAvro() {
     dataFormat.getData();
   }

+  @Test
+  public void testSchemaWithBadCharacters() {
+    Schema schema = new Schema("9`\" blah`^&*(^&*(%$^&").addColumn(new Text("one").setNullable(false));
+    AVROIntermediateDataFormat dataFormat = new AVROIntermediateDataFormat(schema);
+    Assert.assertEquals(dataFormat.getAvroSchema().getName(), "blah");
+  }
 }
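The new testSchemaWithBadCharacters case ties the fix together: replaceFirst("^[0-9]", "") drops the leading 9 from the schema name and replaceAll("[^a-zA-Z0-9_]", "") removes the backquotes, quote, spaces, and punctuation, so the generated Avro record name is exactly the "blah" that the assertion expects.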