mirror of
https://github.com/apache/sqoop.git
synced 2025-05-03 20:40:58 +08:00
SQOOP-3075: Simplify Unicode character support in
source files (introduced by SQOOP-3074) by defining explicit locales instead of using EscapeUtils (Attila Szabo)
This commit is contained in:
parent
5771a2da5f
commit
be30a344ee
@ -28,7 +28,6 @@
|
|||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
import org.apache.avro.io.DatumReader;
|
import org.apache.avro.io.DatumReader;
|
||||||
import org.apache.avro.mapred.FsInput;
|
import org.apache.avro.mapred.FsInput;
|
||||||
import org.apache.commons.lang.StringEscapeUtils;
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
@ -107,10 +106,7 @@ public static Object toAvro(Object o, Schema.Field field, boolean bigDecimalForm
|
|||||||
* Convert Column name into Avro column name.
|
* Convert Column name into Avro column name.
|
||||||
*/
|
*/
|
||||||
public static String toAvroColumn(String column) {
|
public static String toAvroColumn(String column) {
|
||||||
// We're unescaping identifiers to get the real Unicode characters
|
String candidate = ClassWriter.toJavaIdentifier(column);
|
||||||
// back, and not the escaped versions.
|
|
||||||
String candidate = StringEscapeUtils.unescapeJava(
|
|
||||||
ClassWriter.toJavaIdentifier(column));
|
|
||||||
return toAvroIdentifier(candidate);
|
return toAvroIdentifier(candidate);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -29,7 +29,6 @@
|
|||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.avro.Schema.Field;
|
import org.apache.avro.Schema.Field;
|
||||||
import org.apache.avro.Schema.Type;
|
import org.apache.avro.Schema.Type;
|
||||||
import org.apache.commons.lang.StringEscapeUtils;
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
@ -89,9 +88,7 @@ public Schema generate(String schemaNameOverride) throws IOException {
|
|||||||
|
|
||||||
List<Field> fields = new ArrayList<Field>();
|
List<Field> fields = new ArrayList<Field>();
|
||||||
for (String columnName : columnNames) {
|
for (String columnName : columnNames) {
|
||||||
// We're unescaping identifiers to get the real Unicode characters
|
String cleanedCol = AvroUtil.toAvroIdentifier(ClassWriter.toJavaIdentifier(columnName));
|
||||||
// back, and not the escaped versions.
|
|
||||||
String cleanedCol = AvroUtil.toAvroIdentifier(StringEscapeUtils.unescapeJava(ClassWriter.toJavaIdentifier(columnName)));
|
|
||||||
List<Integer> columnInfoList = columnInfo.get(columnName);
|
List<Integer> columnInfoList = columnInfo.get(columnName);
|
||||||
int sqlType = columnInfoList.get(0);
|
int sqlType = columnInfoList.get(0);
|
||||||
Integer precision = columnInfoList.get(1);
|
Integer precision = columnInfoList.get(1);
|
||||||
|
@ -24,6 +24,7 @@
|
|||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.io.OutputStreamWriter;
|
import java.io.OutputStreamWriter;
|
||||||
import java.io.Writer;
|
import java.io.Writer;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -284,16 +285,7 @@ public static String toJavaIdentifier(String candidate) {
|
|||||||
return "_" + output;
|
return "_" + output;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calling StringEscapeUtils#escapeJava is required because we'd like to
|
return output;
|
||||||
// support Unicode characters in identifiers even if the locale of the host
|
|
||||||
// system is not supporting UTF-8, or by any reason the locale is different
|
|
||||||
// from that. Good example: if a column name would contain a \uC3A1 char
|
|
||||||
// in its name, though the locale would not support Unicode characters
|
|
||||||
// then the generated java file would contain unrecognizable characters
|
|
||||||
// for the compiler, and javac would fail with a compile error. If the name
|
|
||||||
// of the column would be Alm\uC3A1a then it would be Alm\uC3A1a after the
|
|
||||||
// escaping, and this holds in every place where it's used.
|
|
||||||
return StringEscapeUtils.escapeJava(output);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private String toJavaType(String columnName, int sqlType) {
|
private String toJavaType(String columnName, int sqlType) {
|
||||||
@ -1796,7 +1788,7 @@ public void generate() throws IOException {
|
|||||||
Writer writer = null;
|
Writer writer = null;
|
||||||
try {
|
try {
|
||||||
ostream = new FileOutputStream(filename);
|
ostream = new FileOutputStream(filename);
|
||||||
writer = new OutputStreamWriter(ostream);
|
writer = new OutputStreamWriter(ostream, StandardCharsets.UTF_8);
|
||||||
writer.append(sb.toString());
|
writer.append(sb.toString());
|
||||||
} finally {
|
} finally {
|
||||||
if (null != writer) {
|
if (null != writer) {
|
||||||
|
@ -23,6 +23,7 @@
|
|||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -170,6 +171,9 @@ public void compile() throws IOException {
|
|||||||
String curClasspath = System.getProperty("java.class.path");
|
String curClasspath = System.getProperty("java.class.path");
|
||||||
LOG.debug("Current sqoop classpath = " + curClasspath);
|
LOG.debug("Current sqoop classpath = " + curClasspath);
|
||||||
|
|
||||||
|
args.add("-encoding");
|
||||||
|
args.add(StandardCharsets.UTF_8.toString());
|
||||||
|
|
||||||
args.add("-sourcepath");
|
args.add("-sourcepath");
|
||||||
args.add(jarOutDir);
|
args.add(jarOutDir);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user