diff --git a/hdfswriter/src/main/java/com/alibaba/datax/plugin/writer/hdfswriter/HdfsWriter.java b/hdfswriter/src/main/java/com/alibaba/datax/plugin/writer/hdfswriter/HdfsWriter.java index 0119be2b..853613a2 100644 --- a/hdfswriter/src/main/java/com/alibaba/datax/plugin/writer/hdfswriter/HdfsWriter.java +++ b/hdfswriter/src/main/java/com/alibaba/datax/plugin/writer/hdfswriter/HdfsWriter.java @@ -81,10 +81,10 @@ public class HdfsWriter extends Writer { //writeMode check this.writeMode = this.writerSliceConfig.getNecessaryValue(Key.WRITE_MODE, HdfsWriterErrorCode.REQUIRED_VALUE); writeMode = writeMode.toLowerCase().trim(); - Set supportedWriteModes = Sets.newHashSet("append", "nonconflict"); + Set supportedWriteModes = Sets.newHashSet("append", "nonconflict", "truncate"); if (!supportedWriteModes.contains(writeMode)) { throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE, - String.format("仅支持append, nonConflict两种模式, 不支持您配置的 writeMode 模式 : [%s]", + String.format("仅支持append, nonConflict, truncate三种模式, 不支持您配置的 writeMode 模式 : [%s]", writeMode)); } this.writerSliceConfig.set(Key.WRITE_MODE, writeMode); @@ -179,6 +179,9 @@ public class HdfsWriter extends Writer { LOG.error(String.format("冲突文件列表为: [%s]", StringUtils.join(allFiles, ","))); throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE, String.format("由于您配置了writeMode nonConflict,但您配置的path: [%s] 目录不为空, 下面存在其他文件或文件夹.", path)); + }else if ("truncate".equalsIgnoreCase(writeMode) && isExistFile) { + LOG.info(String.format("由于您配置了writeMode truncate, [%s] 下面的内容将被覆盖重写", path)); + hdfsHelper.deleteFiles(existFilePaths); } }else{ throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE, diff --git a/plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/reader/Constant.java b/plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/reader/Constant.java index 729d71ac..f998357e 100755 --- a/plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/reader/Constant.java +++ b/plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/reader/Constant.java @@ -25,4 +25,6 @@ public final class Constant { public static String TABLE_NAME_PLACEHOLDER = "@table"; + public static Integer SPLIT_FACTOR = 5; + } diff --git a/plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/reader/Key.java b/plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/reader/Key.java index 63f8dde0..0e10c742 100755 --- a/plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/reader/Key.java +++ b/plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/reader/Key.java @@ -46,5 +46,6 @@ public final class Key { public final static String DRYRUN = "dryRun"; + public static String SPLIT_FACTOR = "splitFactor"; } \ No newline at end of file diff --git a/plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/reader/util/ReaderSplitUtil.java b/plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/reader/util/ReaderSplitUtil.java index 64109e90..ed48ff8c 100755 --- a/plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/reader/util/ReaderSplitUtil.java +++ b/plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/reader/util/ReaderSplitUtil.java @@ -68,7 +68,12 @@ public final class ReaderSplitUtil { //eachTableShouldSplittedNumber = eachTableShouldSplittedNumber * 2 + 1;// 不应该加1导致长尾 //考虑其他比率数字?(splitPk is null, 忽略此长尾) - eachTableShouldSplittedNumber = eachTableShouldSplittedNumber * 5; + //eachTableShouldSplittedNumber = eachTableShouldSplittedNumber * 5; + + //为避免导入hive小文件 默认基数为5,可以通过 splitFactor 配置基数 + // 最终task数为(channel/tableNum)向上取整*splitFactor + Integer splitFactor = originalSliceConfig.getInt(Key.SPLIT_FACTOR, Constant.SPLIT_FACTOR); + eachTableShouldSplittedNumber = eachTableShouldSplittedNumber * splitFactor; } // 尝试对每个表,切分为eachTableShouldSplittedNumber 份 for (String table : tables) {