Export-从HDFS导到DBMS，支持配置读取文件编码，使用方法：--fileencoding gbk

2025-05-02 20:39:58 +08:00 · 2019-11-19 08:51:23 +08:00 · 2019-11-19 08:51:23 +08:00 · e0538e2bb1
commit e0538e2bb1
parent 912fbc1c9c
4 changed files with 40 additions and 5 deletions
--- a/src/java/org/apache/sqoop/mapreduce/TextExportMapper.java
+++ b/src/java/org/apache/sqoop/mapreduce/TextExportMapper.java
@ -31,6 +31,8 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;

+import static org.apache.sqoop.tool.BaseSqoopTool.ENCODE;
+
 /**
 * Converts an input record from a string representation to a parsed Sqoop
 * record and emits that DBWritable to the OutputFormat for writeback to the
@ -44,6 +46,7 @@ public class TextExportMapper
  public static final Log LOG =
    LogFactory.getLog(TextExportMapper.class.getName());

+  private String encoding;
  private SqoopRecord recordImpl;

  boolean enableDataDumpOnError;
@ -80,13 +83,21 @@ protected void setup(Context context)
    }

    enableDataDumpOnError = conf.getBoolean(DUMP_DATA_ON_ERROR_KEY, false);
+
+    encoding = conf.get(ENCODE);
  }


  public void map(LongWritable key, Text val, Context context)
      throws IOException, InterruptedException {
    try {
-      recordImpl.parse(val);
+      // Transcoding is reportedly expensive, so it is only done when an encoding is configured
+      if (encoding != null) {
+        String newValue = new String(val.getBytes(), 0, val.getLength(), encoding);
+        recordImpl.parse(newValue);
+      } else {
+        recordImpl.parse(val);
+      }
      context.write(recordImpl, NullWritable.get());
    } catch (Exception e) {
      // Something bad has happened
--- a/src/java/org/apache/sqoop/tool/BaseSqoopTool.java
+++ b/src/java/org/apache/sqoop/tool/BaseSqoopTool.java
@ -174,6 +174,8 @@ public abstract class BaseSqoopTool extends com.cloudera.sqoop.tool.SqoopTool {
  public static final String THROW_ON_ERROR_ARG = "throw-on-error";
  public static final String ORACLE_ESCAPING_DISABLED = "oracle-escaping-disabled";
  public static final String ESCAPE_MAPPING_COLUMN_NAMES_ENABLED = "escape-mapping-column-names";
+  public static final String FILE_ENCODING = "fileencoding";// Long option name of the file encoding argument
+  public static final String ENCODE = "sqoop.mapreduce.export.encode";// Configuration key under which the chosen encoding is stored

  // Arguments for validation.
  public static final String VALIDATE_ARG = "validate";
--- a/src/java/org/apache/sqoop/tool/ExportTool.java
+++ b/src/java/org/apache/sqoop/tool/ExportTool.java
@ -208,6 +208,23 @@ public void configureOptions(ToolOptions toolOptions) {

    toolOptions.addUniqueOptions(codeGenOpts);
    toolOptions.addUniqueOptions(getHCatalogOptions());
+
+    toolOptions.addUniqueOptions(getFileencodingOptions());
+  }
+
+  /**
+   * Builds the option group holding the file encoding argument.
+   *
+   * @return a group with one long option, named by FILE_ENCODING, that takes an argument
+   */
+  protected RelatedOptions getFileencodingOptions() {
+    RelatedOptions fileencodingOptions = new RelatedOptions("fileencoding arguments");
+    fileencodingOptions.addOption(OptionBuilder
+            .hasArg()
+            .withDescription("fileencoding")
+            .withLongOpt(FILE_ENCODING)
+            .create());
+    return fileencodingOptions;
   }

  @Override
@ -279,6 +296,11 @@ public void applyOptions(CommandLine in, SqoopOptions out)
          out.setCall(in.getOptionValue(CALL_ARG));
      }

+      // Store the requested file encoding in the job configuration
+      if (in.hasOption(FILE_ENCODING)) {
+          out.getConf().set(ENCODE, in.getOptionValue(FILE_ENCODING));
+      }
+
      applyValidationOptions(in, out);
      applyNewUpdateOptions(in, out);
      applyInputFormatOptions(in, out);
--- a/src/test/org/apache/sqoop/mapreduce/db/DateSplitterTest.java
+++ b/src/test/org/apache/sqoop/mapreduce/db/DateSplitterTest.java
@ -24,13 +24,13 @@ public void split() throws Exception {
        long maxVal;

        int sqlDataType = Types.TIMESTAMP;
-        minVal = df.parse("2019-04-22 11:28:30").getTime();
-        maxVal = df.parse("2019-04-22 16:28:30").getTime();
+        minVal = df.parse("2019-04-22 00:00:00").getTime();
+        maxVal = df.parse("2019-04-22 23:59:59").getTime();

        String lowClausePrefix = colName + " >= ";
        String highClausePrefix = colName + " < ";

-        int numSplits = 2;
+        int numSplits = 1440;
        if (numSplits < 1) {
            numSplits = 1;
        }
@ -45,7 +45,7 @@ public void split() throws Exception {
        }

        // For split size we are using seconds. So we need to convert to milliseconds.
-        long splitLimit = 3600 * MS_IN_SEC;
+        long splitLimit = -1 * MS_IN_SEC;

        // Gather the split point integers
        List<Long> splitPoints = dateSplitter.split(numSplits, splitLimit, minVal, maxVal);