5
0
mirror of https://github.com/apache/sqoop.git synced 2025-05-03 20:40:58 +08:00

SQOOP-3075: Simplify Unicode character support in source files (introduced by SQOOP-3074) by defining explicit locales instead of using EscapeUtils

(Attila Szabo)
This commit is contained in:
Attila Szabo 2016-12-16 11:48:52 +01:00
parent 5771a2da5f
commit be30a344ee
4 changed files with 9 additions and 20 deletions

View File

@ -28,7 +28,6 @@
import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader; import org.apache.avro.io.DatumReader;
import org.apache.avro.mapred.FsInput; import org.apache.avro.mapred.FsInput;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
@ -107,10 +106,7 @@ public static Object toAvro(Object o, Schema.Field field, boolean bigDecimalForm
* Convert Column name into Avro column name. * Convert Column name into Avro column name.
*/ */
public static String toAvroColumn(String column) { public static String toAvroColumn(String column) {
// We're unescaping identifiers to get the real Unicode characters String candidate = ClassWriter.toJavaIdentifier(column);
// back, and not the escaped versions.
String candidate = StringEscapeUtils.unescapeJava(
ClassWriter.toJavaIdentifier(column));
return toAvroIdentifier(candidate); return toAvroIdentifier(candidate);
} }

View File

@ -29,7 +29,6 @@
import org.apache.avro.Schema; import org.apache.avro.Schema;
import org.apache.avro.Schema.Field; import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type; import org.apache.avro.Schema.Type;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
@ -89,9 +88,7 @@ public Schema generate(String schemaNameOverride) throws IOException {
List<Field> fields = new ArrayList<Field>(); List<Field> fields = new ArrayList<Field>();
for (String columnName : columnNames) { for (String columnName : columnNames) {
// We're unescaping identifiers to get the real Unicode characters String cleanedCol = AvroUtil.toAvroIdentifier(ClassWriter.toJavaIdentifier(columnName));
// back, and not the escaped versions.
String cleanedCol = AvroUtil.toAvroIdentifier(StringEscapeUtils.unescapeJava(ClassWriter.toJavaIdentifier(columnName)));
List<Integer> columnInfoList = columnInfo.get(columnName); List<Integer> columnInfoList = columnInfo.get(columnName);
int sqlType = columnInfoList.get(0); int sqlType = columnInfoList.get(0);
Integer precision = columnInfoList.get(1); Integer precision = columnInfoList.get(1);

View File

@ -24,6 +24,7 @@
import java.io.OutputStream; import java.io.OutputStream;
import java.io.OutputStreamWriter; import java.io.OutputStreamWriter;
import java.io.Writer; import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Date; import java.util.Date;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
@ -284,16 +285,7 @@ public static String toJavaIdentifier(String candidate) {
return "_" + output; return "_" + output;
} }
// Calling StringEscapeUtils#escapeJava is required because we'd like to return output;
// support Unicode characters in identifiers even if the locale of the host
// system is not supporting UTF-8, or by any reason the locale is different
// from that. Good example: if a column name would contain a \uC3A1 char
// in it's name, though the locale would not support Unicode characters
// then the generated java file would contain unrecognizable characters
// for the compiler, and javac would fail with a compile error. If the name
// of the column would be Alm\uC3A1a then it would be Alm\uC3A1a after the
// escaping, and this every places where it's used/
return StringEscapeUtils.escapeJava(output);
} }
private String toJavaType(String columnName, int sqlType) { private String toJavaType(String columnName, int sqlType) {
@ -1796,7 +1788,7 @@ public void generate() throws IOException {
Writer writer = null; Writer writer = null;
try { try {
ostream = new FileOutputStream(filename); ostream = new FileOutputStream(filename);
writer = new OutputStreamWriter(ostream); writer = new OutputStreamWriter(ostream, StandardCharsets.UTF_8);
writer.append(sb.toString()); writer.append(sb.toString());
} finally { } finally {
if (null != writer) { if (null != writer) {

View File

@ -23,6 +23,7 @@
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.OutputStream; import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
@ -170,6 +171,9 @@ public void compile() throws IOException {
String curClasspath = System.getProperty("java.class.path"); String curClasspath = System.getProperty("java.class.path");
LOG.debug("Current sqoop classpath = " + curClasspath); LOG.debug("Current sqoop classpath = " + curClasspath);
args.add("-encoding");
args.add(StandardCharsets.UTF_8.toString());
args.add("-sourcepath"); args.add("-sourcepath");
args.add(jarOutDir); args.add(jarOutDir);