SQOOP-3075: Simplify Unicode character support in source files (introduced by SQOOP-3074) by defining explicit locales instead of using EscapeUtils (Attila Szabo)
2025-05-03 07:42:00 +08:00 · 2016-12-16 11:48:52 +01:00 · 2016-12-16 11:48:52 +01:00 · be30a344ee
commit be30a344ee
parent 5771a2da5f
4 changed files with 9 additions and 20 deletions
--- a/src/java/org/apache/sqoop/avro/AvroUtil.java
+++ b/src/java/org/apache/sqoop/avro/AvroUtil.java
@@ -28,7 +28,6 @@
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.io.DatumReader;
 import org.apache.avro.mapred.FsInput;
-import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
@@ -107,10 +106,7 @@ public static Object toAvro(Object o, Schema.Field field, boolean bigDecimalForm
   * Convert Column name into Avro column name.
   */
  public static String toAvroColumn(String column) {
-    // We're unescaping identifiers to get the real Unicode characters
-    // back, and not the escaped versions.
-    String candidate = StringEscapeUtils.unescapeJava(
-        ClassWriter.toJavaIdentifier(column));
+    String candidate = ClassWriter.toJavaIdentifier(column);
    return toAvroIdentifier(candidate);
  }

--- a/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
+++ b/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
@@ -29,7 +29,6 @@
 import org.apache.avro.Schema;
 import org.apache.avro.Schema.Field;
 import org.apache.avro.Schema.Type;
-import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;

@@ -89,9 +88,7 @@ public Schema generate(String schemaNameOverride) throws IOException {

    List<Field> fields = new ArrayList<Field>();
    for (String columnName : columnNames) {
-      // We're unescaping identifiers to get the real Unicode characters
-      // back, and not the escaped versions.
-      String cleanedCol = AvroUtil.toAvroIdentifier(StringEscapeUtils.unescapeJava(ClassWriter.toJavaIdentifier(columnName)));
+      String cleanedCol = AvroUtil.toAvroIdentifier(ClassWriter.toJavaIdentifier(columnName));
      List<Integer> columnInfoList = columnInfo.get(columnName);
      int sqlType = columnInfoList.get(0);
      Integer precision = columnInfoList.get(1);
--- a/src/java/org/apache/sqoop/orm/ClassWriter.java
+++ b/src/java/org/apache/sqoop/orm/ClassWriter.java
@@ -24,6 +24,7 @@
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
+import java.nio.charset.StandardCharsets;
 import java.util.Date;
 import java.util.HashSet;
 import java.util.List;
@@ -284,16 +285,7 @@ public static String toJavaIdentifier(String candidate) {
      return "_" + output;
    }

-    // Calling StringEscapeUtils#escapeJava is required because we'd like to
-    // support Unicode characters in identifiers even if the locale of the host
-    // system is not supporting UTF-8, or by any reason the locale is different
-    // from that. Good example: if a column name would contain a \uC3A1 char
-    // in it's name, though the locale would not support Unicode characters
-    // then the generated java file would contain unrecognizable characters
-    // for the compiler, and javac would fail with a compile error. If the name
-    // of the column would be Alm\uC3A1a then it would be Alm\uC3A1a after the
-    // escaping, and this every places where it's used/
-    return StringEscapeUtils.escapeJava(output);
+    return output;
  }

  private String toJavaType(String columnName, int sqlType) {
@@ -1796,7 +1788,7 @@ public void generate() throws IOException {
    Writer writer = null;
    try {
      ostream = new FileOutputStream(filename);
-      writer = new OutputStreamWriter(ostream);
+      writer = new OutputStreamWriter(ostream, StandardCharsets.UTF_8);
      writer.append(sb.toString());
    } finally {
      if (null != writer) {
--- a/src/java/org/apache/sqoop/orm/CompilationManager.java
+++ b/src/java/org/apache/sqoop/orm/CompilationManager.java
@@ -23,6 +23,7 @@
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
@@ -170,6 +171,9 @@ public void compile() throws IOException {
    String curClasspath = System.getProperty("java.class.path");
    LOG.debug("Current sqoop classpath = " + curClasspath);

+    args.add("-encoding");
+    args.add(StandardCharsets.UTF_8.toString());
+
    args.add("-sourcepath");
    args.add(jarOutDir);