mirror of
https://github.com/alibaba/DataX.git
synced 2025-05-02 03:39:19 +08:00
Merge branch 'master' into featureFor1780
This commit is contained in:
commit
260320b89d
21
README.md
21
README.md
@ -26,7 +26,7 @@ DataX本身作为数据同步框架,将不同数据源的同步抽象为从源
|
||||
|
||||
# Quick Start
|
||||
|
||||
##### Download [DataX下载地址](https://datax-opensource.oss-cn-hangzhou.aliyuncs.com/202303/datax.tar.gz)
|
||||
##### Download [DataX下载地址](https://datax-opensource.oss-cn-hangzhou.aliyuncs.com/202308/datax.tar.gz)
|
||||
|
||||
|
||||
##### 请点击:[Quick Start](https://github.com/alibaba/DataX/blob/master/userGuid.md)
|
||||
@ -100,7 +100,7 @@ DataX目前已经有了比较全面的插件体系,主流的RDBMS数据库、N
|
||||
- 整库迁移:https://help.aliyun.com/document_detail/137809.html
|
||||
- 批量上云:https://help.aliyun.com/document_detail/146671.html
|
||||
- 更新更多能力请访问:https://help.aliyun.com/document_detail/137663.html
|
||||
|
||||
-
|
||||
|
||||
# 我要开发新的插件
|
||||
|
||||
@ -109,6 +109,23 @@ DataX目前已经有了比较全面的插件体系,主流的RDBMS数据库、N
|
||||
# 重要版本更新说明
|
||||
|
||||
DataX 后续计划月度迭代更新,也欢迎感兴趣的同学提交 Pull requests,月度更新内容介绍如下。
|
||||
|
||||
- [datax_v202309](https://github.com/alibaba/DataX/releases/tag/datax_v202309)
|
||||
- 支持Phoenix 同步数据添加 where条件
|
||||
- 支持华为 GaussDB读写插件
|
||||
- 修复ClickReader 插件运行报错 Can't find bundle for base name
|
||||
- 增加 DataX调试模块
|
||||
- 修复 orc空文件报错问题
|
||||
- 优化obwriter性能
|
||||
- txtfilewriter 增加导出为insert语句功能支持
|
||||
- HdfsReader/HdfsWriter 支持parquet读写能力
|
||||
|
||||
- [datax_v202308](https://github.com/alibaba/DataX/releases/tag/datax_v202308)
|
||||
- OTS 插件更新
|
||||
- databend 插件更新
|
||||
- Oceanbase驱动修复
|
||||
|
||||
|
||||
- [datax_v202306](https://github.com/alibaba/DataX/releases/tag/datax_v202306)
|
||||
- 精简代码
|
||||
- 新增插件(neo4jwriter、clickhousewriter)
|
||||
|
@ -2,7 +2,7 @@
|
||||
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
|
||||
<id>datax</id>
|
||||
<id></id>
|
||||
<formats>
|
||||
<format>dir</format>
|
||||
</formats>
|
||||
|
@ -27,8 +27,6 @@ public class ClickhouseReader extends Reader {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(ClickhouseReader.class);
|
||||
|
||||
public static class Job extends Reader.Job {
|
||||
private static MessageSource MESSAGE_SOURCE = MessageSource.loadResourceBundle(ClickhouseReader.class);
|
||||
|
||||
private Configuration jobConfig = null;
|
||||
private CommonRdbmsReader.Job commonRdbmsReaderMaster;
|
||||
|
||||
|
@ -1,74 +0,0 @@
|
||||
package com.alibaba.datax.plugin.reader.clickhousereader;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.alibaba.datax.common.element.Column;
|
||||
import com.alibaba.datax.common.element.Record;
|
||||
import com.alibaba.datax.common.util.Configuration;
|
||||
import com.alibaba.datax.dataxservice.face.eventcenter.EventLogStore;
|
||||
import com.alibaba.datax.dataxservice.face.eventcenter.RuntimeContext;
|
||||
import com.alibaba.datax.test.simulator.BasicReaderPluginTest;
|
||||
import com.alibaba.datax.test.simulator.junit.extend.log.LoggedRunner;
|
||||
import com.alibaba.datax.test.simulator.junit.extend.log.TestLogger;
|
||||
import com.alibaba.fastjson.JSON;
|
||||
|
||||
import org.apache.commons.lang3.ArrayUtils;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
|
||||
|
||||
@RunWith(LoggedRunner.class)
|
||||
@Ignore
|
||||
public class ClickhouseReaderTest extends BasicReaderPluginTest {
|
||||
@TestLogger(log = "测试basic1.json. 配置常量.")
|
||||
@Test
|
||||
public void testBasic1() {
|
||||
RuntimeContext.setGlobalJobId(-1);
|
||||
EventLogStore.init();
|
||||
List<Record> noteRecordForTest = new ArrayList<Record>();
|
||||
|
||||
List<Configuration> subjobs = super.doReaderTest("basic1.json", 1, noteRecordForTest);
|
||||
|
||||
Assert.assertEquals(1, subjobs.size());
|
||||
Assert.assertEquals(1, noteRecordForTest.size());
|
||||
|
||||
Assert.assertEquals("[8,16,32,64,-8,-16,-32,-64,\"3.2\",\"6.4\",1,\"str_col\",\"abc\"," + "\"417ddc5d-e556-4d27-95dd-a34d84e46a50\",1580745600000,1580752800000,\"hello\",\"[1,2,3]\"," + "\"[\\\"abc\\\",\\\"cde\\\"]\",\"(8,'uint8_type')\",null,\"[1,2]\",\"[\\\"x\\\",\\\"y\\\"]\",\"127.0.0.1\",\"::\",\"23.345\"]", JSON.toJSONString(listData(noteRecordForTest.get(0))));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected OutputStream buildDataOutput(String optionalOutputName) {
|
||||
File f = new File(optionalOutputName + "-output.txt");
|
||||
try {
|
||||
return new FileOutputStream(f);
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getTestPluginName() {
|
||||
return "clickhousereader";
|
||||
}
|
||||
|
||||
private Object[] listData(Record record) {
|
||||
if (null == record) {
|
||||
return ArrayUtils.EMPTY_OBJECT_ARRAY;
|
||||
}
|
||||
Object[] arr = new Object[record.getColumnNumber()];
|
||||
for (int i = 0; i < arr.length; i++) {
|
||||
Column col = record.getColumn(i);
|
||||
if (null != col) {
|
||||
arr[i] = col.getRawData();
|
||||
}
|
||||
}
|
||||
return arr;
|
||||
}
|
||||
}
|
@ -5,6 +5,7 @@ import com.alibaba.datax.common.exception.DataXException;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
import java.math.BigInteger;
|
||||
import java.sql.Time;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
@ -12,18 +13,54 @@ import java.util.Date;
|
||||
*/
|
||||
public class DateColumn extends Column {
|
||||
|
||||
private DateType subType = DateType.DATETIME;
|
||||
private DateType subType = DateType.DATETIME;
|
||||
|
||||
public static enum DateType {
|
||||
DATE, TIME, DATETIME
|
||||
}
|
||||
private int nanos = 0;
|
||||
|
||||
/**
|
||||
* 构建值为null的DateColumn,使用Date子类型为DATETIME
|
||||
* */
|
||||
public DateColumn() {
|
||||
this((Long)null);
|
||||
}
|
||||
private int precision = -1;
|
||||
|
||||
public static enum DateType {
|
||||
DATE, TIME, DATETIME
|
||||
}
|
||||
|
||||
/**
|
||||
* 构建值为time(java.sql.Time)的DateColumn,使用Date子类型为TIME,只有时间,没有日期
|
||||
*/
|
||||
public DateColumn(Time time, int nanos, int jdbcPrecision) {
|
||||
this(time);
|
||||
if (time != null) {
|
||||
setNanos(nanos);
|
||||
}
|
||||
if (jdbcPrecision == 10) {
|
||||
setPrecision(0);
|
||||
}
|
||||
if (jdbcPrecision >= 12 && jdbcPrecision <= 17) {
|
||||
setPrecision(jdbcPrecision - 11);
|
||||
}
|
||||
}
|
||||
|
||||
public long getNanos() {
|
||||
return nanos;
|
||||
}
|
||||
|
||||
public void setNanos(int nanos) {
|
||||
this.nanos = nanos;
|
||||
}
|
||||
|
||||
public int getPrecision() {
|
||||
return precision;
|
||||
}
|
||||
|
||||
public void setPrecision(int precision) {
|
||||
this.precision = precision;
|
||||
}
|
||||
|
||||
/**
|
||||
* 构建值为null的DateColumn,使用Date子类型为DATETIME
|
||||
*/
|
||||
public DateColumn() {
|
||||
this((Long) null);
|
||||
}
|
||||
|
||||
/**
|
||||
* 构建值为stamp(Unix时间戳)的DateColumn,使用Date子类型为DATETIME
|
||||
|
@ -0,0 +1,34 @@
|
||||
package com.alibaba.datax.common.util;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @author jitongchen
|
||||
* @date 2023/9/7 9:47 AM
|
||||
*/
|
||||
public class LimitLogger {
|
||||
|
||||
private static Map<String, Long> lastPrintTime = new HashMap<>();
|
||||
|
||||
public static void limit(String name, long limit, LoggerFunction function) {
|
||||
if (StringUtils.isBlank(name)) {
|
||||
name = "__all__";
|
||||
}
|
||||
if (limit <= 0) {
|
||||
function.apply();
|
||||
} else {
|
||||
if (!lastPrintTime.containsKey(name)) {
|
||||
lastPrintTime.put(name, System.currentTimeMillis());
|
||||
function.apply();
|
||||
} else {
|
||||
if (System.currentTimeMillis() > lastPrintTime.get(name) + limit) {
|
||||
lastPrintTime.put(name, System.currentTimeMillis());
|
||||
function.apply();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,10 @@
|
||||
package com.alibaba.datax.common.util;
|
||||
|
||||
/**
 * Callback with no arguments and no result, invoked through {@link #apply()}.
 *
 * @author molin.lxd
 * @date 2021-05-09
 */
@FunctionalInterface
public interface LoggerFunction {

    /** Runs the callback. */
    void apply();
}
|
@ -168,6 +168,7 @@ public final class ConfigParser {
|
||||
boolean isDefaultPath = StringUtils.isBlank(pluginPath);
|
||||
if (isDefaultPath) {
|
||||
configuration.set("path", path);
|
||||
configuration.set("loadType","jarLoader");
|
||||
}
|
||||
|
||||
Configuration result = Configuration.newDefault();
|
||||
|
@ -15,7 +15,7 @@ import java.util.List;
|
||||
/**
|
||||
* 提供Jar隔离的加载机制,会把传入的路径、及其子路径、以及路径中的jar文件加入到class path。
|
||||
*/
|
||||
public class JarLoader extends URLClassLoader {
|
||||
public class JarLoader extends URLClassLoader{
|
||||
public JarLoader(String[] paths) {
|
||||
this(paths, JarLoader.class.getClassLoader());
|
||||
}
|
||||
|
@ -49,7 +49,7 @@ public class LoadUtil {
|
||||
/**
|
||||
* jarLoader的缓冲
|
||||
*/
|
||||
private static Map<String, JarLoader> jarLoaderCenter = new HashMap<String, JarLoader>();
|
||||
private static Map<String, JarLoader> jarLoaderCenter = new HashMap();
|
||||
|
||||
/**
|
||||
* 设置pluginConfigs,方便后面插件来获取
|
||||
|
@ -79,6 +79,8 @@ create table if not exsits datax.sample1(a string, b int64, c date, d timestamp,
|
||||
"writer": {
|
||||
"name": "databendwriter",
|
||||
"parameter": {
|
||||
"writeMode": "replace",
|
||||
"onConflictColumn": ["id"],
|
||||
"username": "databend",
|
||||
"password": "databend",
|
||||
"column": ["a", "b", "c", "d", "e", "f", "g"],
|
||||
@ -149,6 +151,16 @@ create table if not exsits datax.sample1(a string, b int64, c date, d timestamp,
|
||||
* 必选: 否
|
||||
* 默认值: 无
|
||||
* 示例: ["select count(*) from datax.sample1"]
|
||||
* writeMode
|
||||
* 描述:写入模式,支持 insert 和 replace 两种模式,默认为 insert。若为 replace,务必填写 onConflictColumn 参数
|
||||
* 必选:否
|
||||
* 默认值:insert
|
||||
* 示例:"replace"
|
||||
* onConflictColumn
|
||||
* 描述:on conflict 字段,指定 writeMode 为 replace 后,需要此参数
|
||||
* 必选:否
|
||||
* 默认值:无
|
||||
* 示例:["id","user"]
|
||||
|
||||
### 3.3 类型转化
|
||||
DataX中的数据类型可以转换为databend中的相应数据类型。下表显示了两种类型之间的对应关系。
|
||||
|
@ -142,6 +142,16 @@ create table if not exsits datax.sample1(a string, b int64, c date, d timestamp,
|
||||
* Description: A list of SQL statements that will be executed after the write operation.
|
||||
* Required: no
|
||||
* Default: none
|
||||
* writeMode
|
||||
* Description: The write mode. Two modes are supported: `insert` and `replace`. When set to `replace`, `onConflictColumn` must also be provided.
|
||||
* Required:no
|
||||
* Default:insert
|
||||
* Example:"replace"
|
||||
* onConflictColumn
|
||||
* Description: The list of on-conflict columns; required when `writeMode` is `replace`.
|
||||
* Required:no
|
||||
* Default:none
|
||||
* Example:["id","user"]
|
||||
|
||||
### 3.3 Type Convert
|
||||
Data types in datax can be converted to the corresponding data types in databend. The following table shows the correspondence between the two types.
|
||||
|
@ -17,7 +17,7 @@
|
||||
<dependency>
|
||||
<groupId>com.databend</groupId>
|
||||
<artifactId>databend-jdbc</artifactId>
|
||||
<version>0.0.7</version>
|
||||
<version>0.1.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
|
@ -17,20 +17,17 @@ import java.sql.*;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class DatabendWriter extends Writer
|
||||
{
|
||||
public class DatabendWriter extends Writer {
|
||||
private static final DataBaseType DATABASE_TYPE = DataBaseType.Databend;
|
||||
|
||||
public static class Job
|
||||
extends Writer.Job
|
||||
{
|
||||
extends Writer.Job {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(Job.class);
|
||||
private Configuration originalConfig;
|
||||
private CommonRdbmsWriter.Job commonRdbmsWriterMaster;
|
||||
|
||||
@Override
|
||||
public void init()
|
||||
{
|
||||
public void init() throws DataXException {
|
||||
this.originalConfig = super.getPluginJobConf();
|
||||
this.commonRdbmsWriterMaster = new CommonRdbmsWriter.Job(DATABASE_TYPE);
|
||||
this.commonRdbmsWriterMaster.init(this.originalConfig);
|
||||
@ -39,8 +36,7 @@ public class DatabendWriter extends Writer
|
||||
}
|
||||
|
||||
@Override
|
||||
public void preCheck()
|
||||
{
|
||||
public void preCheck() {
|
||||
this.init();
|
||||
this.commonRdbmsWriterMaster.writerPreCheck(this.originalConfig, DATABASE_TYPE);
|
||||
}
|
||||
@ -67,8 +63,7 @@ public class DatabendWriter extends Writer
|
||||
}
|
||||
|
||||
|
||||
public static class Task extends Writer.Task
|
||||
{
|
||||
public static class Task extends Writer.Task {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(Task.class);
|
||||
|
||||
private Configuration writerSliceConfig;
|
||||
@ -76,11 +71,10 @@ public class DatabendWriter extends Writer
|
||||
private CommonRdbmsWriter.Task commonRdbmsWriterSlave;
|
||||
|
||||
@Override
|
||||
public void init()
|
||||
{
|
||||
public void init() {
|
||||
this.writerSliceConfig = super.getPluginJobConf();
|
||||
|
||||
this.commonRdbmsWriterSlave = new CommonRdbmsWriter.Task(DataBaseType.Databend){
|
||||
this.commonRdbmsWriterSlave = new CommonRdbmsWriter.Task(DataBaseType.Databend) {
|
||||
@Override
|
||||
protected PreparedStatement fillPreparedStatementColumnType(PreparedStatement preparedStatement, int columnIndex, int columnSqltype, String typeName, Column column) throws SQLException {
|
||||
try {
|
||||
@ -177,8 +171,8 @@ public class DatabendWriter extends Writer
|
||||
|
||||
case Types.BOOLEAN:
|
||||
|
||||
// warn: bit(1) -> Types.BIT 可使用setBoolean
|
||||
// warn: bit(>1) -> Types.VARBINARY 可使用setBytes
|
||||
// warn: bit(1) -> Types.BIT 可使用setBoolean
|
||||
// warn: bit(>1) -> Types.VARBINARY 可使用setBytes
|
||||
case Types.BIT:
|
||||
if (this.dataBaseType == DataBaseType.MySql) {
|
||||
Boolean asBoolean = column.asBoolean();
|
||||
@ -224,8 +218,7 @@ public class DatabendWriter extends Writer
|
||||
}
|
||||
|
||||
@Override
|
||||
public void destroy()
|
||||
{
|
||||
public void destroy() {
|
||||
this.commonRdbmsWriterSlave.destroy(this.writerSliceConfig);
|
||||
}
|
||||
|
||||
@ -238,9 +231,9 @@ public class DatabendWriter extends Writer
|
||||
public void post() {
|
||||
this.commonRdbmsWriterSlave.post(this.writerSliceConfig);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startWrite(RecordReceiver lineReceiver)
|
||||
{
|
||||
public void startWrite(RecordReceiver lineReceiver) {
|
||||
this.commonRdbmsWriterSlave.startWrite(lineReceiver, this.writerSliceConfig, this.getTaskPluginCollector());
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,33 @@
|
||||
package com.alibaba.datax.plugin.writer.databendwriter;
|
||||
|
||||
import com.alibaba.datax.common.spi.ErrorCode;
|
||||
|
||||
|
||||
public enum DatabendWriterErrorCode implements ErrorCode {
|
||||
CONF_ERROR("DatabendWriter-00", "配置错误."),
|
||||
WRITE_DATA_ERROR("DatabendWriter-01", "写入数据时失败."),
|
||||
;
|
||||
|
||||
private final String code;
|
||||
private final String description;
|
||||
|
||||
private DatabendWriterErrorCode(String code, String description) {
|
||||
this.code = code;
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getCode() {
|
||||
return this.code;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getDescription() {
|
||||
return this.description;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("Code:[%s], Description:[%s].", this.code, this.description);
|
||||
}
|
||||
}
|
@ -1,40 +1,72 @@
|
||||
package com.alibaba.datax.plugin.writer.databendwriter.util;
|
||||
|
||||
import com.alibaba.datax.common.exception.DataXException;
|
||||
import com.alibaba.datax.common.util.Configuration;
|
||||
import com.alibaba.datax.plugin.rdbms.writer.Constant;
|
||||
import com.alibaba.datax.plugin.rdbms.writer.Key;
|
||||
|
||||
import com.alibaba.datax.plugin.writer.databendwriter.DatabendWriterErrorCode;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.xml.crypto.Data;
|
||||
import java.util.List;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public final class DatabendWriterUtil
|
||||
{
|
||||
public final class DatabendWriterUtil {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(DatabendWriterUtil.class);
|
||||
|
||||
private DatabendWriterUtil() {}
|
||||
public static void dealWriteMode(Configuration originalConfig)
|
||||
{
|
||||
private DatabendWriterUtil() {
|
||||
}
|
||||
|
||||
public static void dealWriteMode(Configuration originalConfig) throws DataXException {
|
||||
List<String> columns = originalConfig.getList(Key.COLUMN, String.class);
|
||||
List<String> onConflictColumns = originalConfig.getList(Key.ONCONFLICT_COLUMN, String.class);
|
||||
StringBuilder writeDataSqlTemplate = new StringBuilder();
|
||||
|
||||
String jdbcUrl = originalConfig.getString(String.format("%s[0].%s",
|
||||
Constant.CONN_MARK, Key.JDBC_URL, String.class));
|
||||
|
||||
String writeMode = originalConfig.getString(Key.WRITE_MODE, "INSERT");
|
||||
LOG.info("write mode is {}", writeMode);
|
||||
if (writeMode.toLowerCase().contains("replace")) {
|
||||
if (onConflictColumns == null || onConflictColumns.size() == 0) {
|
||||
throw DataXException
|
||||
.asDataXException(
|
||||
DatabendWriterErrorCode.CONF_ERROR,
|
||||
String.format(
|
||||
"Replace mode must has onConflictColumn config."
|
||||
));
|
||||
}
|
||||
|
||||
StringBuilder writeDataSqlTemplate = new StringBuilder();
|
||||
writeDataSqlTemplate.append("INSERT INTO %s");
|
||||
StringJoiner columnString = new StringJoiner(",");
|
||||
// for databend if you want to use replace mode, the writeMode should be: "writeMode": "replace"
|
||||
writeDataSqlTemplate.append("REPLACE INTO %s (")
|
||||
.append(StringUtils.join(columns, ",")).append(") ").append(onConFlictDoString(onConflictColumns))
|
||||
.append(" VALUES");
|
||||
|
||||
for (String column : columns) {
|
||||
columnString.add(column);
|
||||
LOG.info("Replace data [\n{}\n], which jdbcUrl like:[{}]", writeDataSqlTemplate, jdbcUrl);
|
||||
originalConfig.set(Constant.INSERT_OR_REPLACE_TEMPLATE_MARK, writeDataSqlTemplate);
|
||||
} else {
|
||||
writeDataSqlTemplate.append("INSERT INTO %s");
|
||||
StringJoiner columnString = new StringJoiner(",");
|
||||
|
||||
for (String column : columns) {
|
||||
columnString.add(column);
|
||||
}
|
||||
writeDataSqlTemplate.append(String.format("(%s)", columnString));
|
||||
writeDataSqlTemplate.append(" VALUES");
|
||||
|
||||
LOG.info("Insert data [\n{}\n], which jdbcUrl like:[{}]", writeDataSqlTemplate, jdbcUrl);
|
||||
|
||||
originalConfig.set(Constant.INSERT_OR_REPLACE_TEMPLATE_MARK, writeDataSqlTemplate);
|
||||
}
|
||||
writeDataSqlTemplate.append(String.format("(%s)", columnString));
|
||||
writeDataSqlTemplate.append(" VALUES");
|
||||
|
||||
LOG.info("Write data [\n{}\n], which jdbcUrl like:[{}]", writeDataSqlTemplate, jdbcUrl);
|
||||
|
||||
originalConfig.set(Constant.INSERT_OR_REPLACE_TEMPLATE_MARK, writeDataSqlTemplate);
|
||||
}
|
||||
}
|
||||
|
||||
public static String onConFlictDoString(List<String> conflictColumns) {
|
||||
return " ON " +
|
||||
"(" +
|
||||
StringUtils.join(conflictColumns, ",") + ") ";
|
||||
}
|
||||
}
|
||||
|
20
datax-example/datax-example-core/pom.xml
Normal file
20
datax-example/datax-example-core/pom.xml
Normal file
@ -0,0 +1,20 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>datax-example</artifactId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>datax-example-core</artifactId>
|
||||
|
||||
<properties>
|
||||
<maven.compiler.source>8</maven.compiler.source>
|
||||
<maven.compiler.target>8</maven.compiler.target>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
</properties>
|
||||
|
||||
</project>
|
@ -0,0 +1,26 @@
|
||||
package com.alibaba.datax.example;
|
||||
|
||||
import com.alibaba.datax.common.util.Configuration;
|
||||
import com.alibaba.datax.core.Engine;
|
||||
import com.alibaba.datax.example.util.ExampleConfigParser;
|
||||
|
||||
/**
|
||||
* {@code Date} 2023/8/6 11:22
|
||||
*
|
||||
* @author fuyouj
|
||||
*/
|
||||
|
||||
public class ExampleContainer {
|
||||
/**
|
||||
* example对外暴露的启动入口
|
||||
* 使用前最好看下 datax-example/doc/README.MD
|
||||
* @param jobPath 任务json绝对路径
|
||||
*/
|
||||
public static void start(String jobPath) {
|
||||
|
||||
Configuration configuration = ExampleConfigParser.parse(jobPath);
|
||||
|
||||
Engine engine = new Engine();
|
||||
engine.start(configuration);
|
||||
}
|
||||
}
|
@ -0,0 +1,23 @@
|
||||
package com.alibaba.datax.example;
|
||||
|
||||
|
||||
import com.alibaba.datax.example.util.PathUtil;
|
||||
|
||||
/**
|
||||
* @author fuyouj
|
||||
*/
|
||||
public class Main {
|
||||
|
||||
/**
|
||||
* 1.在example模块pom文件添加你依赖的的调试插件,
|
||||
* 你可以直接打开本模块的pom文件,参考是如何引入streamreader,streamwriter
|
||||
* 2. 在此处指定你的job文件
|
||||
*/
|
||||
public static void main(String[] args) {
|
||||
|
||||
String classPathJobPath = "/job/stream2stream.json";
|
||||
String absJobPath = PathUtil.getAbsolutePathFromClassPath(classPathJobPath);
|
||||
ExampleContainer.start(absJobPath);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,154 @@
|
||||
package com.alibaba.datax.example.util;
|
||||
|
||||
import com.alibaba.datax.common.exception.DataXException;
|
||||
import com.alibaba.datax.common.util.Configuration;
|
||||
import com.alibaba.datax.core.util.ConfigParser;
|
||||
import com.alibaba.datax.core.util.FrameworkErrorCode;
|
||||
import com.alibaba.datax.core.util.container.CoreConstant;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* @author fuyouj
|
||||
*/
|
||||
public class ExampleConfigParser {
|
||||
private static final String CORE_CONF = "/example/conf/core.json";
|
||||
|
||||
private static final String PLUGIN_DESC_FILE = "plugin.json";
|
||||
|
||||
/**
|
||||
* 指定Job配置路径,ConfigParser会解析Job、Plugin、Core全部信息,并以Configuration返回
|
||||
* 不同于Core的ConfigParser,这里的core,plugin 不依赖于编译后的datax.home,而是扫描程序编译后的target目录
|
||||
*/
|
||||
public static Configuration parse(final String jobPath) {
|
||||
|
||||
Configuration configuration = ConfigParser.parseJobConfig(jobPath);
|
||||
configuration.merge(coreConfig(),
|
||||
false);
|
||||
|
||||
Map<String, String> pluginTypeMap = new HashMap<>();
|
||||
String readerName = configuration.getString(CoreConstant.DATAX_JOB_CONTENT_READER_NAME);
|
||||
String writerName = configuration.getString(CoreConstant.DATAX_JOB_CONTENT_WRITER_NAME);
|
||||
pluginTypeMap.put(readerName, "reader");
|
||||
pluginTypeMap.put(writerName, "writer");
|
||||
Configuration pluginsDescConfig = parsePluginsConfig(pluginTypeMap);
|
||||
configuration.merge(pluginsDescConfig, false);
|
||||
return configuration;
|
||||
}
|
||||
|
||||
private static Configuration parsePluginsConfig(Map<String, String> pluginTypeMap) {
|
||||
|
||||
Configuration configuration = Configuration.newDefault();
|
||||
|
||||
//最初打算通过user.dir获取工作目录来扫描插件,
|
||||
//但是user.dir在不同有一些不确定性,所以废弃了这个选择
|
||||
|
||||
for (File basePackage : runtimeBasePackages()) {
|
||||
if (pluginTypeMap.isEmpty()) {
|
||||
break;
|
||||
}
|
||||
scanPluginByPackage(basePackage, configuration, basePackage.listFiles(), pluginTypeMap);
|
||||
}
|
||||
if (!pluginTypeMap.isEmpty()) {
|
||||
String failedPlugin = pluginTypeMap.keySet().toString();
|
||||
String message = "\nplugin %s load failed :ry to analyze the reasons from the following aspects.。\n" +
|
||||
"1: Check if the name of the plugin is spelled correctly, and verify whether DataX supports this plugin\n" +
|
||||
"2:Verify if the <resource></resource> tag has been added under <build></build> section in the pom file of the relevant plugin.\n<resource>" +
|
||||
" <directory>src/main/resources</directory>\n" +
|
||||
" <includes>\n" +
|
||||
" <include>**/*.*</include>\n" +
|
||||
" </includes>\n" +
|
||||
" <filtering>true</filtering>\n" +
|
||||
" </resource>\n [Refer to the streamreader pom file] \n" +
|
||||
"3: Check that the datax-yourPlugin-example module imported your test plugin";
|
||||
message = String.format(message, failedPlugin);
|
||||
throw DataXException.asDataXException(FrameworkErrorCode.PLUGIN_INIT_ERROR, message);
|
||||
}
|
||||
return configuration;
|
||||
}
|
||||
|
||||
/**
|
||||
* 通过classLoader获取程序编译的输出目录
|
||||
*
|
||||
* @return File[/datax-example/target/classes,xxReader/target/classes,xxWriter/target/classes]
|
||||
*/
|
||||
private static File[] runtimeBasePackages() {
|
||||
List<File> basePackages = new ArrayList<>();
|
||||
ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
|
||||
Enumeration<URL> resources = null;
|
||||
try {
|
||||
resources = classLoader.getResources("");
|
||||
} catch (IOException e) {
|
||||
throw DataXException.asDataXException(e.getMessage());
|
||||
}
|
||||
|
||||
while (resources.hasMoreElements()) {
|
||||
URL resource = resources.nextElement();
|
||||
File file = new File(resource.getFile());
|
||||
if (file.isDirectory()) {
|
||||
basePackages.add(file);
|
||||
}
|
||||
}
|
||||
|
||||
return basePackages.toArray(new File[0]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param packageFile 编译出来的target/classes根目录 便于找到插件时设置插件的URL目录,设置根目录是最保险的方式
|
||||
* @param configuration pluginConfig
|
||||
* @param files 待扫描文件
|
||||
* @param needPluginTypeMap 需要的插件
|
||||
*/
|
||||
private static void scanPluginByPackage(File packageFile,
|
||||
Configuration configuration,
|
||||
File[] files,
|
||||
Map<String, String> needPluginTypeMap) {
|
||||
if (files == null) {
|
||||
return;
|
||||
}
|
||||
for (File file : files) {
|
||||
if (file.isFile() && PLUGIN_DESC_FILE.equals(file.getName())) {
|
||||
Configuration pluginDesc = Configuration.from(file);
|
||||
String descPluginName = pluginDesc.getString("name", "");
|
||||
|
||||
if (needPluginTypeMap.containsKey(descPluginName)) {
|
||||
|
||||
String type = needPluginTypeMap.get(descPluginName);
|
||||
configuration.merge(parseOnePlugin(packageFile.getAbsolutePath(), type, descPluginName, pluginDesc), false);
|
||||
needPluginTypeMap.remove(descPluginName);
|
||||
|
||||
}
|
||||
} else {
|
||||
scanPluginByPackage(packageFile, configuration, file.listFiles(), needPluginTypeMap);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static Configuration parseOnePlugin(String packagePath,
|
||||
String pluginType,
|
||||
String pluginName,
|
||||
Configuration pluginDesc) {
|
||||
//设置path 兼容jarLoader的加载方式URLClassLoader
|
||||
pluginDesc.set("path", packagePath);
|
||||
Configuration pluginConfInJob = Configuration.newDefault();
|
||||
pluginConfInJob.set(
|
||||
String.format("plugin.%s.%s", pluginType, pluginName),
|
||||
pluginDesc.getInternal());
|
||||
return pluginConfInJob;
|
||||
}
|
||||
|
||||
private static Configuration coreConfig() {
|
||||
try {
|
||||
URL resource = ExampleConfigParser.class.getResource(CORE_CONF);
|
||||
return Configuration.from(Paths.get(resource.toURI()).toFile());
|
||||
} catch (Exception ignore) {
|
||||
throw DataXException.asDataXException("Failed to load the configuration file core.json. " +
|
||||
"Please check whether /example/conf/core.json exists!");
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,26 @@
|
||||
package com.alibaba.datax.example.util;
|
||||
|
||||
|
||||
import com.alibaba.datax.common.exception.DataXException;
|
||||
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
/**
|
||||
* @author fuyouj
|
||||
*/
|
||||
public class PathUtil {
|
||||
public static String getAbsolutePathFromClassPath(String path) {
|
||||
URL resource = PathUtil.class.getResource(path);
|
||||
try {
|
||||
assert resource != null;
|
||||
URI uri = resource.toURI();
|
||||
return Paths.get(uri).toString();
|
||||
} catch (NullPointerException | URISyntaxException e) {
|
||||
throw DataXException.asDataXException("path error,please check whether the path is correct");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
60
datax-example/datax-example-core/src/main/resources/example/conf/core.json
Executable file
60
datax-example/datax-example-core/src/main/resources/example/conf/core.json
Executable file
@ -0,0 +1,60 @@
|
||||
{
|
||||
"entry": {
|
||||
"jvm": "-Xms1G -Xmx1G",
|
||||
"environment": {}
|
||||
},
|
||||
"common": {
|
||||
"column": {
|
||||
"datetimeFormat": "yyyy-MM-dd HH:mm:ss",
|
||||
"timeFormat": "HH:mm:ss",
|
||||
"dateFormat": "yyyy-MM-dd",
|
||||
"extraFormats":["yyyyMMdd"],
|
||||
"timeZone": "GMT+8",
|
||||
"encoding": "utf-8"
|
||||
}
|
||||
},
|
||||
"core": {
|
||||
"dataXServer": {
|
||||
"address": "http://localhost:7001/api",
|
||||
"timeout": 10000,
|
||||
"reportDataxLog": false,
|
||||
"reportPerfLog": false
|
||||
},
|
||||
"transport": {
|
||||
"channel": {
|
||||
"class": "com.alibaba.datax.core.transport.channel.memory.MemoryChannel",
|
||||
"speed": {
|
||||
"byte": -1,
|
||||
"record": -1
|
||||
},
|
||||
"flowControlInterval": 20,
|
||||
"capacity": 512,
|
||||
"byteCapacity": 67108864
|
||||
},
|
||||
"exchanger": {
|
||||
"class": "com.alibaba.datax.core.plugin.BufferedRecordExchanger",
|
||||
"bufferSize": 32
|
||||
}
|
||||
},
|
||||
"container": {
|
||||
"job": {
|
||||
"reportInterval": 10000
|
||||
},
|
||||
"taskGroup": {
|
||||
"channel": 5
|
||||
},
|
||||
"trace": {
|
||||
"enable": "false"
|
||||
}
|
||||
|
||||
},
|
||||
"statistics": {
|
||||
"collector": {
|
||||
"plugin": {
|
||||
"taskClass": "com.alibaba.datax.core.statistics.plugin.task.StdoutPluginCollector",
|
||||
"maxDirtyNumber": 10
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,19 @@
|
||||
package com.alibaba.datax.example.util;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* {@code Author} FuYouJ
|
||||
* {@code Date} 2023/8/19 21:38
|
||||
*/
|
||||
|
||||
public class PathUtilTest {
|
||||
|
||||
@Test
|
||||
public void testParseClassPathFile() {
|
||||
String path = "/pathTest.json";
|
||||
String absolutePathFromClassPath = PathUtil.getAbsolutePathFromClassPath(path);
|
||||
Assert.assertNotNull(absolutePathFromClassPath);
|
||||
}
|
||||
}
|
@ -0,0 +1 @@
|
||||
{}
|
43
datax-example/datax-example-neo4j/pom.xml
Normal file
43
datax-example/datax-example-neo4j/pom.xml
Normal file
@ -0,0 +1,43 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>datax-example</artifactId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>datax-example-neo4j</artifactId>
|
||||
|
||||
<properties>
|
||||
<maven.compiler.source>8</maven.compiler.source>
|
||||
<maven.compiler.target>8</maven.compiler.target>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<test.container.version>1.17.6</test.container.version>
|
||||
<neo4j-java-driver.version>4.4.9</neo4j-java-driver.version>
|
||||
</properties>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>datax-example-core</artifactId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.testcontainers</groupId>
|
||||
<artifactId>testcontainers</artifactId>
|
||||
<version>${test.container.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>neo4jwriter</artifactId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>datax-example-streamreader</artifactId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
@ -0,0 +1,138 @@
|
||||
package com.alibaba.datax.example.neo4j;
|
||||
|
||||
import com.alibaba.datax.example.ExampleContainer;
|
||||
import com.alibaba.datax.example.util.PathUtil;
|
||||
import org.junit.After;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.neo4j.driver.*;
|
||||
import org.neo4j.driver.types.Node;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.testcontainers.containers.GenericContainer;
|
||||
import org.testcontainers.containers.Network;
|
||||
import org.testcontainers.containers.output.Slf4jLogConsumer;
|
||||
import org.testcontainers.lifecycle.Startables;
|
||||
import org.testcontainers.shaded.org.awaitility.Awaitility;
|
||||
import org.testcontainers.utility.DockerImageName;
|
||||
import org.testcontainers.utility.DockerLoggerFactory;
|
||||
|
||||
import java.net.URI;
|
||||
import java.util.Arrays;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
/**
|
||||
* {@code Author} FuYouJ
|
||||
* {@code Date} 2023/8/19 21:48
|
||||
*/
|
||||
|
||||
public class StreamReader2Neo4jWriterTest {
|
||||
private static final Logger LOGGER = LoggerFactory.getLogger(StreamReader2Neo4jWriterTest.class);
|
||||
private static final String CONTAINER_IMAGE = "neo4j:5.9.0";
|
||||
|
||||
private static final String CONTAINER_HOST = "neo4j-host";
|
||||
private static final int HTTP_PORT = 7474;
|
||||
private static final int BOLT_PORT = 7687;
|
||||
private static final String CONTAINER_NEO4J_USERNAME = "neo4j";
|
||||
private static final String CONTAINER_NEO4J_PASSWORD = "Test@12343";
|
||||
private static final URI CONTAINER_URI = URI.create("neo4j://localhost:" + BOLT_PORT);
|
||||
|
||||
protected static final Network NETWORK = Network.newNetwork();
|
||||
|
||||
private GenericContainer<?> container;
|
||||
protected Driver neo4jDriver;
|
||||
protected Session neo4jSession;
|
||||
private static final int CHANNEL = 5;
|
||||
private static final int READER_NUM = 10;
|
||||
|
||||
@Before
|
||||
public void init() {
|
||||
DockerImageName imageName = DockerImageName.parse(CONTAINER_IMAGE);
|
||||
container =
|
||||
new GenericContainer<>(imageName)
|
||||
.withNetwork(NETWORK)
|
||||
.withNetworkAliases(CONTAINER_HOST)
|
||||
.withExposedPorts(HTTP_PORT, BOLT_PORT)
|
||||
.withEnv(
|
||||
"NEO4J_AUTH",
|
||||
CONTAINER_NEO4J_USERNAME + "/" + CONTAINER_NEO4J_PASSWORD)
|
||||
.withEnv("apoc.export.file.enabled", "true")
|
||||
.withEnv("apoc.import.file.enabled", "true")
|
||||
.withEnv("apoc.import.file.use_neo4j_config", "true")
|
||||
.withEnv("NEO4J_PLUGINS", "[\"apoc\"]")
|
||||
.withLogConsumer(
|
||||
new Slf4jLogConsumer(
|
||||
DockerLoggerFactory.getLogger(CONTAINER_IMAGE)));
|
||||
container.setPortBindings(
|
||||
Arrays.asList(
|
||||
String.format("%s:%s", HTTP_PORT, HTTP_PORT),
|
||||
String.format("%s:%s", BOLT_PORT, BOLT_PORT)));
|
||||
Startables.deepStart(Stream.of(container)).join();
|
||||
LOGGER.info("container started");
|
||||
Awaitility.given()
|
||||
.ignoreExceptions()
|
||||
.await()
|
||||
.atMost(30, TimeUnit.SECONDS)
|
||||
.untilAsserted(this::initConnection);
|
||||
}
|
||||
|
||||
//在neo4jWriter模块使用Example测试整个job,方便发现整个流程的代码问题
|
||||
@Test
|
||||
public void streamReader2Neo4j() {
|
||||
|
||||
deleteHistoryIfExist();
|
||||
|
||||
String path = "/streamreader2neo4j.json";
|
||||
String jobPath = PathUtil.getAbsolutePathFromClassPath(path);
|
||||
|
||||
ExampleContainer.start(jobPath);
|
||||
|
||||
//根据channel和reader的mock数据,校验结果集是否符合预期
|
||||
verifyWriteResult();
|
||||
}
|
||||
|
||||
private void deleteHistoryIfExist() {
|
||||
String query = "match (n:StreamReader) return n limit 1";
|
||||
String delete = "match (n:StreamReader) delete n";
|
||||
if (neo4jSession.run(query).hasNext()) {
|
||||
neo4jSession.run(delete);
|
||||
}
|
||||
}
|
||||
|
||||
private void verifyWriteResult() {
|
||||
int total = CHANNEL * READER_NUM;
|
||||
String query = "match (n:StreamReader) return n";
|
||||
Result run = neo4jSession.run(query);
|
||||
int count = 0;
|
||||
while (run.hasNext()) {
|
||||
Record record = run.next();
|
||||
Node node = record.get("n").asNode();
|
||||
if (node.hasLabel("StreamReader")) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
Assert.assertEquals(count, total);
|
||||
}
|
||||
@After
|
||||
public void destroy() {
|
||||
if (neo4jSession != null) {
|
||||
neo4jSession.close();
|
||||
}
|
||||
if (neo4jDriver != null) {
|
||||
neo4jDriver.close();
|
||||
}
|
||||
if (container != null) {
|
||||
container.close();
|
||||
}
|
||||
}
|
||||
|
||||
private void initConnection() {
|
||||
neo4jDriver =
|
||||
GraphDatabase.driver(
|
||||
CONTAINER_URI,
|
||||
AuthTokens.basic(CONTAINER_NEO4J_USERNAME, CONTAINER_NEO4J_PASSWORD));
|
||||
neo4jSession = neo4jDriver.session(SessionConfig.forDatabase("neo4j"));
|
||||
}
|
||||
}
|
@ -0,0 +1,51 @@
|
||||
{
|
||||
"job": {
|
||||
"content": [
|
||||
{
|
||||
"reader": {
|
||||
"name": "streamreader",
|
||||
"parameter": {
|
||||
"sliceRecordCount": 10,
|
||||
"column": [
|
||||
{
|
||||
"type": "string",
|
||||
"value": "StreamReader"
|
||||
},
|
||||
{
|
||||
"type": "string",
|
||||
"value": "1997"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"writer": {
|
||||
"name": "neo4jWriter",
|
||||
"parameter": {
|
||||
"uri": "bolt://localhost:7687",
|
||||
"username":"neo4j",
|
||||
"password":"Test@12343",
|
||||
"database":"neo4j",
|
||||
"cypher": "unwind $batch as row CALL apoc.cypher.doIt( 'create (n:`' + row.Label + '`{id:$id})' ,{id: row.id} ) YIELD value RETURN 1 ",
|
||||
"batchDataVariableName": "batch",
|
||||
"batchSize": "3",
|
||||
"properties": [
|
||||
{
|
||||
"name": "Label",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"name": "id",
|
||||
"type": "STRING"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"setting": {
|
||||
"speed": {
|
||||
"channel": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
37
datax-example/datax-example-streamreader/pom.xml
Normal file
37
datax-example/datax-example-streamreader/pom.xml
Normal file
@ -0,0 +1,37 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>datax-example</artifactId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>datax-example-streamreader</artifactId>
|
||||
|
||||
<properties>
|
||||
<maven.compiler.source>8</maven.compiler.source>
|
||||
<maven.compiler.target>8</maven.compiler.target>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
</properties>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>datax-example-core</artifactId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>streamreader</artifactId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>streamwriter</artifactId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
@ -0,0 +1,19 @@
|
||||
package com.alibaba.datax.example.streamreader;
|
||||
|
||||
import com.alibaba.datax.example.ExampleContainer;
|
||||
import com.alibaba.datax.example.util.PathUtil;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* {@code Author} FuYouJ
|
||||
* {@code Date} 2023/8/14 20:16
|
||||
*/
|
||||
|
||||
public class StreamReader2StreamWriterTest {
|
||||
@Test
|
||||
public void testStreamReader2StreamWriter() {
|
||||
String path = "/stream2stream.json";
|
||||
String jobPath = PathUtil.getAbsolutePathFromClassPath(path);
|
||||
ExampleContainer.start(jobPath);
|
||||
}
|
||||
}
|
@ -0,0 +1,36 @@
|
||||
{
|
||||
"job": {
|
||||
"content": [
|
||||
{
|
||||
"reader": {
|
||||
"name": "streamreader",
|
||||
"parameter": {
|
||||
"sliceRecordCount": 10,
|
||||
"column": [
|
||||
{
|
||||
"type": "long",
|
||||
"value": "10"
|
||||
},
|
||||
{
|
||||
"type": "string",
|
||||
"value": "hello,你好,世界-DataX"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"writer": {
|
||||
"name": "streamwriter",
|
||||
"parameter": {
|
||||
"encoding": "UTF-8",
|
||||
"print": true
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"setting": {
|
||||
"speed": {
|
||||
"channel": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
107
datax-example/doc/README.md
Normal file
107
datax-example/doc/README.md
Normal file
@ -0,0 +1,107 @@
|
||||
## [DataX-Example]调试datax插件的模块
|
||||
|
||||
### 为什么要开发这个模块
|
||||
|
||||
一般使用DataX启动数据同步任务是从datax.py 脚本开始,获取程序datax包目录设置到系统变量datax.home里,此后系统核心插件的加载,配置初始化均依赖于变量datax.home,这带来了一些麻烦,以一次本地 DeBug streamreader 插件为例。
|
||||
|
||||
- maven 打包 datax 生成 datax 目录
|
||||
- 在 IDE 中 设置系统环境变量 datax.home,或者在Engine启动类中硬编码设置datax.home。
|
||||
- 修改插件 streamreader 代码
|
||||
- 再次 maven 打包,使JarLoader 能够加载到最新的 streamreader 代码。
|
||||
- 调试代码
|
||||
|
||||
在以上步骤中,打包完全不必要且最耗时,等待打包也最煎熬。
|
||||
|
||||
所以我编写一个新的模块(datax-example),此模块特用于本地调试和复现 BUG。如果模块顺利编写完成,那么以上流程将被简化至两步。
|
||||
|
||||
- 修改插件 streamreader 代码。
|
||||
- 调试代码
|
||||
|
||||
<img src="img/img01.png" alt="img" style="zoom:40%;" />
|
||||
|
||||
### 目录结构
|
||||
该目录结构演示了如何使用datax-example-core编写测试用例,和校验代码流程。
|
||||
<img src="img/img03.png" alt="img" style="zoom:100%;" />
|
||||
|
||||
### 实现原理
|
||||
|
||||
- 不修改原有的ConfigParser,使用新的ExampleConfigParser,仅用于example模块。它不依赖datax.home,而是依赖ide编译后的target目录
|
||||
- 将ide的target目录作为每个插件的类加载目录。
|
||||
|
||||

|
||||
|
||||
### 如何使用
|
||||
1.修改插件的pom文件,做如下改动。以streamreader为例。<br/>
|
||||
改动前
|
||||
```xml
|
||||
<build>
|
||||
<plugins>
|
||||
<!-- compiler plugin -->
|
||||
<plugin>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<configuration>
|
||||
<source>${jdk-version}</source>
|
||||
<target>${jdk-version}</target>
|
||||
<encoding>${project-sourceEncoding}</encoding>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
```
|
||||
改动后
|
||||
```xml
|
||||
<build>
|
||||
<resources>
|
||||
<!--将resource目录也输出到target-->
|
||||
<resource>
|
||||
<directory>src/main/resources</directory>
|
||||
<includes>
|
||||
<include>**/*.*</include>
|
||||
</includes>
|
||||
<filtering>true</filtering>
|
||||
</resource>
|
||||
</resources>
|
||||
<plugins>
|
||||
<!-- compiler plugin -->
|
||||
<plugin>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<configuration>
|
||||
<source>${jdk-version}</source>
|
||||
<target>${jdk-version}</target>
|
||||
<encoding>${project-sourceEncoding}</encoding>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
```
|
||||
#### 在测试模块中使用
|
||||
参考datax-example/datax-example-streamreader的StreamReader2StreamWriterTest.java
|
||||
```java
|
||||
public class StreamReader2StreamWriterTest {
|
||||
@Test
|
||||
public void testStreamReader2StreamWriter() {
|
||||
String path = "/stream2stream.json";
|
||||
String jobPath = PathUtil.getAbsolutePathFromClassPath(path);
|
||||
ExampleContainer.start(jobPath);
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
参考datax-example/datax-example-neo4j的StreamReader2Neo4jWriterTest
|
||||
```java
|
||||
public class StreamReader2Neo4jWriterTest{
|
||||
@Test
|
||||
public void streamReader2Neo4j() {
|
||||
|
||||
deleteHistoryIfExist();
|
||||
|
||||
String path = "/streamreader2neo4j.json";
|
||||
String jobPath = PathUtil.getAbsolutePathFromClassPath(path);
|
||||
|
||||
ExampleContainer.start(jobPath);
|
||||
|
||||
//根据channel和reader的mock数据,校验结果集是否符合预期
|
||||
verifyWriteResult();
|
||||
}
|
||||
}
|
||||
```
|
BIN
datax-example/doc/img/img01.png
Normal file
BIN
datax-example/doc/img/img01.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 71 KiB |
BIN
datax-example/doc/img/img02.png
Normal file
BIN
datax-example/doc/img/img02.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 66 KiB |
BIN
datax-example/doc/img/img03.png
Normal file
BIN
datax-example/doc/img/img03.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 43 KiB |
68
datax-example/pom.xml
Normal file
68
datax-example/pom.xml
Normal file
@ -0,0 +1,68 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>datax-all</artifactId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>datax-example</artifactId>
|
||||
<packaging>pom</packaging>
|
||||
<modules>
|
||||
<module>datax-example-core</module>
|
||||
<module>datax-example-streamreader</module>
|
||||
<module>datax-example-neo4j</module>
|
||||
</modules>
|
||||
|
||||
<properties>
|
||||
<maven.compiler.source>8</maven.compiler.source>
|
||||
<maven.compiler.target>8</maven.compiler.target>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<junit4.version>4.13.2</junit4.version>
|
||||
</properties>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>datax-common</artifactId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>datax-core</artifactId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>${junit4.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>src/main/resources</directory>
|
||||
<includes>
|
||||
<include>**/*.*</include>
|
||||
</includes>
|
||||
<filtering>true</filtering>
|
||||
</resource>
|
||||
</resources>
|
||||
<plugins>
|
||||
<!-- compiler plugin -->
|
||||
<plugin>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<configuration>
|
||||
<source>${jdk-version}</source>
|
||||
<target>${jdk-version}</target>
|
||||
<encoding>${project-sourceEncoding}</encoding>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
@ -447,6 +447,9 @@ DataX的内部类型在实现上会选用不同的java类型:
|
||||
3. 用户在插件中在`reader`/`writer`配置的`name`字段指定插件名字。框架根据插件的类型(`reader`/`writer`)和插件名称去插件的路径下扫描所有的jar,加入`classpath`。
|
||||
4. 根据插件配置中定义的入口类,框架通过反射实例化对应的`Job`和`Task`对象。
|
||||
|
||||
### 编写测试用例
|
||||
1. 在datax-example工程下新建新的插件测试模块,调用`ExampleContainer.start(jobPath)`方法来检测你的代码逻辑是否正确。[datax-example使用](https://github.com/alibaba/DataX/blob/master/datax-example/doc/README.md)
|
||||
|
||||
|
||||
## 三、Last but not Least
|
||||
|
||||
|
224
dorisreader/doc/dorisreader.md
Normal file
224
dorisreader/doc/dorisreader.md
Normal file
@ -0,0 +1,224 @@
|
||||
# DorisReader 插件文档
|
||||
|
||||
___
|
||||
|
||||
## 1 快速介绍
|
||||
|
||||
DorisReader插件实现了从Doris读取数据。在底层实现上,DorisReader通过JDBC连接远程Doris数据库,并执行相应的sql语句将数据从doris库中SELECT出来。
|
||||
|
||||
## 2 实现原理
|
||||
|
||||
简而言之,DorisReader通过JDBC连接器连接到远程的Doris数据库,并根据用户配置的信息生成查询SELECT
|
||||
SQL语句,然后发送到远程Doris数据库,并将该SQL执行返回结果使用DataX自定义的数据类型拼装为抽象的数据集,并传递给下游Writer处理。
|
||||
|
||||
对于用户配置Table、Column、Where的信息,DorisReader将其拼接为SQL语句发送到Doris数据库;对于用户配置querySql信息,DorisReader直接将其发送到Doris数据库。
|
||||
|
||||
## 3 功能说明
|
||||
|
||||
### 3.1 配置样例
|
||||
|
||||
* 配置一个从Doris数据库同步抽取数据到本地的作业:
|
||||
|
||||
```
|
||||
{
|
||||
"job": {
|
||||
"setting": {
|
||||
"speed": {
|
||||
"channel": 3
|
||||
},
|
||||
"errorLimit": {
|
||||
"record": 0,
|
||||
"percentage": 0.02
|
||||
}
|
||||
},
|
||||
"content": [
|
||||
{
|
||||
"reader": {
|
||||
"name": "dorisreader",
|
||||
"parameter": {
|
||||
"username": "root",
|
||||
"password": "root",
|
||||
"column": [
|
||||
"id",
|
||||
"name"
|
||||
],
|
||||
"splitPk": "db_id",
|
||||
"connection": [
|
||||
{
|
||||
"table": [
|
||||
"table"
|
||||
],
|
||||
"jdbcUrl": [
|
||||
"jdbc:Doris://127.0.0.1:9030/database"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"writer": {
|
||||
"name": "streamwriter",
|
||||
"parameter": {
|
||||
"print":true
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
* 配置一个自定义SQL的数据库同步任务到本地内容的作业:
|
||||
|
||||
```
|
||||
{
|
||||
"job": {
|
||||
"setting": {
|
||||
"speed": {
|
||||
"channel":1
|
||||
}
|
||||
},
|
||||
"content": [
|
||||
{
|
||||
"reader": {
|
||||
"name": "dorisreader",
|
||||
"parameter": {
|
||||
"username": "root",
|
||||
"password": "root",
|
||||
"connection": [
|
||||
{
|
||||
"querySql": [
|
||||
"select db_id,on_line_flag from db_info where db_id < 10;",
|
||||
"select db_id,on_line_flag from db_info where db_id >= 10;"
|
||||
|
||||
],
|
||||
"jdbcUrl": [
|
||||
"jdbc:Doris://127.0.0.1:9030/database"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"writer": {
|
||||
"name": "streamwriter",
|
||||
"parameter": {
|
||||
"print": false,
|
||||
"encoding": "UTF-8"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 3.2 参数说明
|
||||
|
||||
* **jdbcUrl**
|
||||
|
||||
*
|
||||
描述:描述的是到对端数据库的JDBC连接信息,使用JSON的数组描述,并支持一个库填写多个连接地址。之所以使用JSON数组描述连接信息,是因为阿里集团内部支持多个IP探测,如果配置了多个,DorisReader可以依次探测ip的可连接性,直到选择一个合法的IP。如果全部连接失败,DorisReader报错。
|
||||
注意,jdbcUrl必须包含在connection配置单元中。对于阿里集团外部使用情况,JSON数组填写一个JDBC连接即可。
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **username**
|
||||
|
||||
* 描述:数据源的用户名 <br />
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **password**
|
||||
|
||||
* 描述:数据源指定用户名的密码 <br />
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **table**
|
||||
|
||||
*
|
||||
描述:所选取的需要同步的表。使用JSON的数组描述,因此支持多张表同时抽取。当配置为多张表时,用户自己需保证多张表是同一schema结构,DorisReader不予检查表是否同一逻辑表。注意,table必须包含在connection配置单元中。<br />
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **column**
|
||||
|
||||
* 描述:所配置的表中需要同步的列名集合,使用JSON的数组描述字段信息。用户使用\*代表默认使用所有列配置,例如['\*']。
|
||||
|
||||
支持列裁剪,即列可以挑选部分列进行导出。
|
||||
|
||||
支持列换序,即列可以不按照表schema信息进行导出。
|
||||
|
||||
支持常量配置,用户需要按照Doris SQL语法格式:
|
||||
["id", "\`table\`", "1", "'bazhen.csy'", "null", "to_char(a + 1)", "2.3" , "true"]
|
||||
id为普通列名,\`table\`为包含保留字的列名,1为整形数字常量,'bazhen.csy'为字符串常量,null为空指针,to_char(a + 1)为表达式,2.3为浮点数,true为布尔值。
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **splitPk**
|
||||
|
||||
* 描述:DorisReader进行数据抽取时,如果指定splitPk,表示用户希望使用splitPk代表的字段进行数据分片,DataX因此会启动并发任务进行数据同步,这样可以大大提升数据同步的效能。
|
||||
|
||||
推荐splitPk用户使用表主键,因为表主键通常情况下比较均匀,因此切分出来的分片也不容易出现数据热点。
|
||||
|
||||
目前splitPk仅支持整形数据切分,`不支持浮点、字符串、日期等其他类型`。如果用户指定其他非支持类型,DorisReader将报错!
|
||||
|
||||
如果splitPk不填写,包括不提供splitPk或者splitPk值为空,DataX视作使用单通道同步该表数据。
|
||||
|
||||
* 必选:否 <br />
|
||||
|
||||
* 默认值:空 <br />
|
||||
|
||||
* **where**
|
||||
|
||||
* 描述:筛选条件,DorisReader根据指定的column、table、where条件拼接SQL,并根据这个SQL进行数据抽取。在实际业务场景中,往往会选择当天的数据进行同步,可以将where条件指定为gmt_create >
|
||||
$bizdate 。注意:不可以将where条件指定为limit 10,limit不是SQL的合法where子句。<br />
|
||||
|
||||
where条件可以有效地进行业务增量同步。如果不填写where语句,包括不提供where的key或者value,DataX均视作同步全量数据。
|
||||
|
||||
* 必选:否 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **querySql**
|
||||
|
||||
*
|
||||
描述:在有些业务场景下,where这一配置项不足以描述所筛选的条件,用户可以通过该配置项来自定义筛选SQL。当用户配置了这一项之后,DataX系统就会忽略table,column这些配置项,直接使用这个配置项的内容对数据进行筛选,例如需要进行多表join后同步数据,使用select
|
||||
a,b from table_a join table_b on table_a.id = table_b.id <br />
|
||||
|
||||
`当用户配置querySql时,DorisReader直接忽略table、column、where条件的配置`,querySql优先级大于table、column、where选项。
|
||||
|
||||
* 必选:否 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
### 3.3 类型转换
|
||||
|
||||
目前DorisReader支持大部分Doris类型,但也存在部分个别类型没有支持的情况,请注意检查你的类型。
|
||||
|
||||
下面列出DorisReader针对Doris类型转换列表:
|
||||
|
||||
| DataX 内部类型| doris 数据类型 |
|
||||
| -------- |-------------------------------------------------------|
|
||||
| Long | tinyint, smallint, int, bigint, largeint |
|
||||
| Double | float, double, decimal |
|
||||
| String | varchar, char, text, string, map, json, array, struct |
|
||||
| Date | date, datetime |
|
||||
| Boolean | Boolean |
|
||||
|
||||
请注意:
|
||||
|
||||
* `tinyint(1) DataX视作为整形`。
|
||||
|
||||
|
||||
|
81
dorisreader/pom.xml
Executable file
81
dorisreader/pom.xml
Executable file
@ -0,0 +1,81 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>datax-all</artifactId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dorisreader</artifactId>
|
||||
<name>dorisreader</name>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>datax-common</artifactId>
|
||||
<version>${datax-project-version}</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
<groupId>org.slf4j</groupId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>ch.qos.logback</groupId>
|
||||
<artifactId>logback-classic</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>plugin-rdbms-util</artifactId>
|
||||
<version>${datax-project-version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>mysql</groupId>
|
||||
<artifactId>mysql-connector-java</artifactId>
|
||||
<version>${mysql.driver.version}</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<!-- compiler plugin -->
|
||||
<plugin>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<configuration>
|
||||
<source>${jdk-version}</source>
|
||||
<target>${jdk-version}</target>
|
||||
<encoding>${project-sourceEncoding}</encoding>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<!-- assembly plugin -->
|
||||
<plugin>
|
||||
<artifactId>maven-assembly-plugin</artifactId>
|
||||
<configuration>
|
||||
<descriptors>
|
||||
<descriptor>src/main/assembly/package.xml</descriptor>
|
||||
</descriptors>
|
||||
<finalName>datax</finalName>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>dwzip</id>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>single</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
35
dorisreader/src/main/assembly/package.xml
Executable file
35
dorisreader/src/main/assembly/package.xml
Executable file
@ -0,0 +1,35 @@
|
||||
<assembly
|
||||
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
|
||||
<id></id>
|
||||
<formats>
|
||||
<format>dir</format>
|
||||
</formats>
|
||||
<includeBaseDirectory>false</includeBaseDirectory>
|
||||
<fileSets>
|
||||
<fileSet>
|
||||
<directory>src/main/resources</directory>
|
||||
<includes>
|
||||
<include>plugin.json</include>
|
||||
<include>plugin_job_template.json</include>
|
||||
</includes>
|
||||
<outputDirectory>plugin/reader/dorisreader</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>target/</directory>
|
||||
<includes>
|
||||
<include>dorisreader-0.0.1-SNAPSHOT.jar</include>
|
||||
</includes>
|
||||
<outputDirectory>plugin/reader/dorisreader</outputDirectory>
|
||||
</fileSet>
|
||||
</fileSets>
|
||||
|
||||
<dependencySets>
|
||||
<dependencySet>
|
||||
<useProjectArtifact>false</useProjectArtifact>
|
||||
<outputDirectory>plugin/reader/dorisreader/libs</outputDirectory>
|
||||
<scope>runtime</scope>
|
||||
</dependencySet>
|
||||
</dependencySets>
|
||||
</assembly>
|
@ -0,0 +1,94 @@
|
||||
package com.alibaba.datax.plugin.reader.dorisreader;
|
||||
|
||||
import com.alibaba.datax.common.plugin.RecordSender;
|
||||
import com.alibaba.datax.common.spi.Reader;
|
||||
import com.alibaba.datax.common.util.Configuration;
|
||||
import com.alibaba.datax.plugin.rdbms.reader.CommonRdbmsReader;
|
||||
import com.alibaba.datax.plugin.rdbms.reader.Constant;
|
||||
import com.alibaba.datax.plugin.rdbms.util.DataBaseType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class DorisReader extends Reader {
|
||||
|
||||
private static final DataBaseType DATABASE_TYPE = DataBaseType.Doris;
|
||||
|
||||
public static class Job extends Reader.Job {
|
||||
private static final Logger LOG = LoggerFactory
|
||||
.getLogger(Job.class);
|
||||
|
||||
private Configuration originalConfig = null;
|
||||
private CommonRdbmsReader.Job commonRdbmsReaderJob;
|
||||
|
||||
@Override
|
||||
public void init() {
|
||||
this.originalConfig = super.getPluginJobConf();
|
||||
|
||||
Integer fetchSize = this.originalConfig.getInt(Constant.FETCH_SIZE,Integer.MIN_VALUE);
|
||||
this.originalConfig.set(Constant.FETCH_SIZE, fetchSize);
|
||||
|
||||
this.commonRdbmsReaderJob = new CommonRdbmsReader.Job(DATABASE_TYPE);
|
||||
this.commonRdbmsReaderJob.init(this.originalConfig);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void preCheck(){
|
||||
init();
|
||||
this.commonRdbmsReaderJob.preCheck(this.originalConfig,DATABASE_TYPE);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Configuration> split(int adviceNumber) {
|
||||
return this.commonRdbmsReaderJob.split(this.originalConfig, adviceNumber);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void post() {
|
||||
this.commonRdbmsReaderJob.post(this.originalConfig);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void destroy() {
|
||||
this.commonRdbmsReaderJob.destroy(this.originalConfig);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static class Task extends Reader.Task {
|
||||
|
||||
private Configuration readerSliceConfig;
|
||||
private CommonRdbmsReader.Task commonRdbmsReaderTask;
|
||||
|
||||
@Override
|
||||
public void init() {
|
||||
this.readerSliceConfig = super.getPluginJobConf();
|
||||
this.commonRdbmsReaderTask = new CommonRdbmsReader.Task(DATABASE_TYPE,super.getTaskGroupId(), super.getTaskId());
|
||||
this.commonRdbmsReaderTask.init(this.readerSliceConfig);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startRead(RecordSender recordSender) {
|
||||
int fetchSize = this.readerSliceConfig.getInt(Constant.FETCH_SIZE);
|
||||
|
||||
this.commonRdbmsReaderTask.startRead(this.readerSliceConfig, recordSender,
|
||||
super.getTaskPluginCollector(), fetchSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void post() {
|
||||
this.commonRdbmsReaderTask.post(this.readerSliceConfig);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void destroy() {
|
||||
this.commonRdbmsReaderTask.destroy(this.readerSliceConfig);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,31 @@
|
||||
package com.alibaba.datax.plugin.reader.dorisreader;
|
||||
|
||||
import com.alibaba.datax.common.spi.ErrorCode;
|
||||
|
||||
public enum DorisReaderErrorCode implements ErrorCode {
|
||||
;
|
||||
|
||||
private final String code;
|
||||
private final String description;
|
||||
|
||||
private DorisReaderErrorCode(String code, String description) {
|
||||
this.code = code;
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getCode() {
|
||||
return this.code;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getDescription() {
|
||||
return this.description;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("Code:[%s], Description:[%s]. ", this.code,
|
||||
this.description);
|
||||
}
|
||||
}
|
6
dorisreader/src/main/resources/plugin.json
Executable file
6
dorisreader/src/main/resources/plugin.json
Executable file
@ -0,0 +1,6 @@
|
||||
{
|
||||
"name": "dorisreader",
|
||||
"class": "com.alibaba.datax.plugin.reader.dorisreader.DorisReader",
|
||||
"description": "useScene: prod. mechanism: Jdbc connection using the database, execute select sql, retrieve data from the ResultSet. warn: The more you know about the database, the less problems you encounter.",
|
||||
"developer": "alibaba"
|
||||
}
|
15
dorisreader/src/main/resources/plugin_job_template.json
Normal file
15
dorisreader/src/main/resources/plugin_job_template.json
Normal file
@ -0,0 +1,15 @@
|
||||
{
|
||||
"name": "dorisreader",
|
||||
"parameter": {
|
||||
"username": "",
|
||||
"password": "",
|
||||
"column": [],
|
||||
"connection": [
|
||||
{
|
||||
"jdbcUrl": [],
|
||||
"table": []
|
||||
}
|
||||
],
|
||||
"where": ""
|
||||
}
|
||||
}
|
@ -22,6 +22,7 @@ import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -97,7 +98,7 @@ public class DorisStreamLoadObserver {
|
||||
"could not get the final state of label[%s].\n", label), null);
|
||||
}
|
||||
Map<String, Object> result = (Map<String, Object>)JSON.parse(EntityUtils.toString(respEntity));
|
||||
String labelState = (String)result.get("state");
|
||||
String labelState = (String)result.get("data");
|
||||
if (null == labelState) {
|
||||
throw new IOException(String.format("Failed to flush data to Doris, Error " +
|
||||
"could not get the final state of label[%s]. response[%s]\n", label, EntityUtils.toString(respEntity)), null);
|
||||
@ -210,12 +211,10 @@ public class DorisStreamLoadObserver {
|
||||
|
||||
private String getLoadHost() {
|
||||
List<String> hostList = options.getLoadUrlList();
|
||||
long tmp = pos + hostList.size();
|
||||
for (; pos < tmp; pos++) {
|
||||
String host = new StringBuilder("http://").append(hostList.get((int) (pos % hostList.size()))).toString();
|
||||
if (checkConnection(host)) {
|
||||
return host;
|
||||
}
|
||||
Collections.shuffle(hostList);
|
||||
String host = new StringBuilder("http://").append(hostList.get((0))).toString();
|
||||
if (checkConnection(host)){
|
||||
return host;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
@ -45,7 +45,7 @@
|
||||
<dependency>
|
||||
<groupId>com.jcraft</groupId>
|
||||
<artifactId>jsch</artifactId>
|
||||
<version>0.1.51</version>
|
||||
<version>0.1.54</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-net</groupId>
|
||||
@ -89,4 +89,4 @@
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
||||
</project>
|
||||
|
@ -64,6 +64,8 @@ public class SftpHelper extends FtpHelper {
|
||||
String message = String.format("请确认连接ftp服务器端口是否正确,错误的端口: [%s] ", port);
|
||||
LOG.error(message);
|
||||
throw DataXException.asDataXException(FtpReaderErrorCode.FAIL_LOGIN, message, e);
|
||||
}else{
|
||||
throw DataXException.asDataXException(FtpReaderErrorCode.COMMAND_FTP_IO_EXCEPTION, "", e);
|
||||
}
|
||||
}else {
|
||||
if("Auth fail".equals(e.getMessage())){
|
||||
|
@ -45,7 +45,7 @@
|
||||
<dependency>
|
||||
<groupId>com.jcraft</groupId>
|
||||
<artifactId>jsch</artifactId>
|
||||
<version>0.1.51</version>
|
||||
<version>0.1.54</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-net</groupId>
|
||||
|
297
gaussdbreader/doc/gaussdbreader.md
Normal file
297
gaussdbreader/doc/gaussdbreader.md
Normal file
@ -0,0 +1,297 @@
|
||||
|
||||
# GaussDbReader 插件文档
|
||||
|
||||
|
||||
___
|
||||
|
||||
|
||||
## 1 快速介绍
|
||||
|
||||
GaussDbReader插件实现了从GaussDB读取数据。在底层实现上,GaussDbReader通过JDBC连接远程GaussDB数据库,并执行相应的sql语句将数据从GaussDB库中SELECT出来。
|
||||
|
||||
## 2 实现原理
|
||||
|
||||
简而言之,GaussDbReader通过JDBC连接器连接到远程的GaussDB数据库,并根据用户配置的信息生成查询SELECT SQL语句并发送到远程GaussDB数据库,并将该SQL执行返回结果使用DataX自定义的数据类型拼装为抽象的数据集,并传递给下游Writer处理。
|
||||
|
||||
对于用户配置Table、Column、Where的信息,GaussDbReader将其拼接为SQL语句发送到GaussDB数据库;对于用户配置querySql信息,GaussDbReader直接将其发送到GaussDB数据库。
|
||||
|
||||
|
||||
## 3 功能说明
|
||||
|
||||
### 3.1 配置样例
|
||||
|
||||
* 配置一个从GaussDB数据库同步抽取数据到本地的作业:
|
||||
|
||||
```
|
||||
{
|
||||
"job": {
|
||||
"setting": {
|
||||
"speed": {
|
||||
//设置传输速度,单位为byte/s,DataX运行会尽可能达到该速度但是不超过它.
|
||||
"byte": 1048576
|
||||
},
|
||||
//出错限制
|
||||
"errorLimit": {
|
||||
//出错的record条数上限,当大于该值即报错。
|
||||
"record": 0,
|
||||
//出错的record百分比上限 1.0表示100%,0.02表示2%
|
||||
"percentage": 0.02
|
||||
}
|
||||
},
|
||||
"content": [
|
||||
{
|
||||
"reader": {
|
||||
"name": "gaussdbreader",
|
||||
"parameter": {
|
||||
// 数据库连接用户名
|
||||
"username": "xx",
|
||||
// 数据库连接密码
|
||||
"password": "xx",
|
||||
"column": [
|
||||
"id","name"
|
||||
],
|
||||
//切分主键
|
||||
"splitPk": "id",
|
||||
"connection": [
|
||||
{
|
||||
"table": [
|
||||
"table"
|
||||
],
|
||||
"jdbcUrl": [
|
||||
"jdbc:opengauss://host:port/database"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"writer": {
|
||||
//writer类型
|
||||
"name": "streamwriter",
|
||||
//是否打印内容
|
||||
"parameter": {
|
||||
"print":true
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
* 配置一个自定义SQL的数据库同步任务到本地内容的作业:
|
||||
|
||||
```json
|
||||
{
|
||||
"job": {
|
||||
"setting": {
|
||||
"speed": 1048576
|
||||
},
|
||||
"content": [
|
||||
{
|
||||
"reader": {
|
||||
"name": "gaussdbreader",
|
||||
"parameter": {
|
||||
"username": "xx",
|
||||
"password": "xx",
|
||||
"where": "",
|
||||
"connection": [
|
||||
{
|
||||
"querySql": [
|
||||
"select db_id,on_line_flag from db_info where db_id < 10;"
|
||||
],
|
||||
"jdbcUrl": [
|
||||
"jdbc:opengauss://host:port/database", "jdbc:opengauss://host:port/database"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"writer": {
|
||||
"name": "streamwriter",
|
||||
"parameter": {
|
||||
"print": false,
|
||||
"encoding": "UTF-8"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
### 3.2 参数说明
|
||||
|
||||
* **jdbcUrl**
|
||||
|
||||
* 描述:描述的是到对端数据库的JDBC连接信息,使用JSON的数组描述,并支持一个库填写多个连接地址。之所以使用JSON数组描述连接信息,是因为阿里集团内部支持多个IP探测,如果配置了多个,GaussDbReader可以依次探测ip的可连接性,直到选择一个合法的IP。如果全部连接失败,GaussDbReader报错。 注意,jdbcUrl必须包含在connection配置单元中。对于阿里集团外部使用情况,JSON数组填写一个JDBC连接即可。
|
||||
|
||||
jdbcUrl按照GaussDB官方规范,并可以填写连接附加控制信息。具体请参看[GaussDB官方文档](https://docs.opengauss.org/zh/docs/3.1.0/docs/Developerguide/java-sql-Connection.html)。
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **username**
|
||||
|
||||
* 描述:数据源的用户名 <br />
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **password**
|
||||
|
||||
* 描述:数据源指定用户名的密码 <br />
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **table**
|
||||
|
||||
* 描述:所选取的需要同步的表。使用JSON的数组描述,因此支持多张表同时抽取。当配置为多张表时,用户自己需保证多张表是同一schema结构,GaussDbReader不予检查表是否同一逻辑表。注意,table必须包含在connection配置单元中。<br />
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **column**
|
||||
|
||||
* 描述:所配置的表中需要同步的列名集合,使用JSON的数组描述字段信息。用户使用\*代表默认使用所有列配置,例如['\*']。
|
||||
|
||||
支持列裁剪,即列可以挑选部分列进行导出。
|
||||
|
||||
支持列换序,即列可以不按照表schema信息进行导出。
|
||||
|
||||
支持常量配置,用户需要按照GaussDB语法格式:
|
||||
["id", "'hello'::varchar", "true", "2.5::real", "power(2,3)"]
|
||||
id为普通列名,'hello'::varchar为字符串常量,true为布尔值,2.5为浮点数, power(2,3)为函数。
|
||||
|
||||
**column必须由用户显式指定同步的列集合,不允许为空!**
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **splitPk**
|
||||
|
||||
* 描述:GaussDbReader进行数据抽取时,如果指定splitPk,表示用户希望使用splitPk代表的字段进行数据分片,DataX因此会启动并发任务进行数据同步,这样可以大大提高数据同步的效能。
|
||||
|
||||
推荐splitPk用户使用表主键,因为表主键通常情况下比较均匀,因此切分出来的分片也不容易出现数据热点。
|
||||
|
||||
目前splitPk仅支持整型数据切分,`不支持浮点、字符串型、日期等其他类型`。如果用户指定其他非支持类型,GaussDbReader将报错!
|
||||
|
||||
splitPk设置为空,底层将视作用户不允许对单表进行切分,因此使用单通道进行抽取。
|
||||
|
||||
* 必选:否 <br />
|
||||
|
||||
* 默认值:空 <br />
|
||||
|
||||
* **where**
|
||||
|
||||
* 描述:筛选条件,GaussDbReader根据指定的column、table、where条件拼接SQL,并根据这个SQL进行数据抽取。在实际业务场景中,往往会选择当天的数据进行同步,可以将where条件指定为gmt_create > $bizdate 。注意:不可以将where条件指定为limit 10,limit不是SQL的合法where子句。<br />
|
||||
|
||||
where条件可以有效地进行业务增量同步。 where条件不配置或者为空,视作全表同步数据。
|
||||
|
||||
* 必选:否 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **querySql**
|
||||
|
||||
* 描述:在有些业务场景下,where这一配置项不足以描述所筛选的条件,用户可以通过该配置项来自定义筛选SQL。当用户配置了这一项之后,DataX系统就会忽略table,column这些配置项,直接使用这个配置项的内容对数据进行筛选,例如需要进行多表join后同步数据,使用select a,b from table_a join table_b on table_a.id = table_b.id <br />
|
||||
|
||||
`当用户配置querySql时,GaussDbReader直接忽略table、column、where条件的配置`。
|
||||
|
||||
* 必选:否 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **fetchSize**
|
||||
|
||||
* 描述:该配置项定义了插件和数据库服务器端每次批量数据获取条数,该值决定了DataX和服务器端的网络交互次数,能够较大的提升数据抽取性能。<br />
|
||||
|
||||
`注意,该值过大(>2048)可能造成DataX进程OOM。`
|
||||
|
||||
* 必选:否 <br />
|
||||
|
||||
* 默认值:1000 <br />
|
||||
|
||||
|
||||
### 3.3 类型转换
|
||||
|
||||
目前GaussDbReader支持大部分GaussDB类型,但也存在个别类型没有支持的情况,请注意检查你的类型。
|
||||
|
||||
下面列出GaussDbReader针对GaussDB类型转换列表:
|
||||
|
||||
|
||||
| DataX 内部类型| GaussDB 数据类型 |
|
||||
| -------- | ----- |
|
||||
| Long |bigint, bigserial, integer, smallint, serial |
|
||||
| Double |double precision, money, numeric, real |
|
||||
| String |varchar, char, text, bit, inet|
|
||||
| Date |date, time, timestamp |
|
||||
| Boolean |bool|
|
||||
| Bytes |bytea|
|
||||
|
||||
请注意:
|
||||
|
||||
* `除上述罗列字段类型外,其他类型均不支持; money,inet,bit需用户使用a_inet::varchar类似的语法转换`。
|
||||
|
||||
## 4 性能报告
|
||||
|
||||
### 4.1 环境准备
|
||||
|
||||
#### 4.1.1 数据特征
|
||||
建表语句:
|
||||
|
||||
create table pref_test(
|
||||
id serial,
|
||||
a_bigint bigint,
|
||||
a_bit bit(10),
|
||||
a_boolean boolean,
|
||||
a_char character(5),
|
||||
a_date date,
|
||||
a_double double precision,
|
||||
a_integer integer,
|
||||
a_money money,
|
||||
a_num numeric(10,2),
|
||||
a_real real,
|
||||
a_smallint smallint,
|
||||
a_text text,
|
||||
a_time time,
|
||||
a_timestamp timestamp
|
||||
)
|
||||
|
||||
#### 4.1.2 机器参数
|
||||
|
||||
* 执行DataX的机器参数为:
|
||||
1. cpu: 16核 Intel(R) Xeon(R) CPU E5620 @ 2.40GHz
|
||||
2. mem: MemTotal: 24676836kB MemFree: 6365080kB
|
||||
3. net: 百兆双网卡
|
||||
|
||||
* GaussDB数据库机器参数为:
|
||||
D12 24逻辑核 192G内存 12*480G SSD 阵列
|
||||
|
||||
|
||||
### 4.2 测试报告
|
||||
|
||||
#### 4.2.1 单表测试报告
|
||||
|
||||
|
||||
| 通道数 | 是否按照主键切分 | DataX速度(Rec/s) | DataX流量(MB/s) | DataX机器运行负载 |
|
||||
|--------|--------| --------|--------|--------|
|
||||
|1| 否 | 10211 | 0.63 | 0.2 |
|
||||
|1| 是 | 10211 | 0.63 | 0.2 |
|
||||
|4| 否 | 10211 | 0.63 | 0.2 |
|
||||
|4| 是 | 40000 | 2.48 | 0.5 |
|
||||
|8| 否 | 10211 | 0.63 | 0.2 |
|
||||
|8| 是 | 78048 | 4.84 | 0.8 |
|
||||
|
||||
|
||||
说明:
|
||||
|
||||
1. 这里的单表,主键类型为 serial,数据分布均匀。
|
||||
2. 对单表如果没有按照主键切分,那么配置通道个数不会提升速度,效果与1个通道一样。
|
86
gaussdbreader/pom.xml
Normal file
86
gaussdbreader/pom.xml
Normal file
@ -0,0 +1,86 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>datax-all</artifactId>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>gaussdbreader</artifactId>
|
||||
<name>gaussdbreader</name>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>datax-common</artifactId>
|
||||
<version>${datax-project-version}</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
<groupId>org.slf4j</groupId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>ch.qos.logback</groupId>
|
||||
<artifactId>logback-classic</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>plugin-rdbms-util</artifactId>
|
||||
<version>${datax-project-version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.opengauss</groupId>
|
||||
<artifactId>opengauss-jdbc</artifactId>
|
||||
<version>3.0.0</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<!-- compiler plugin -->
|
||||
<plugin>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<configuration>
|
||||
<source>${jdk-version}</source>
|
||||
<target>${jdk-version}</target>
|
||||
<encoding>${project-sourceEncoding}</encoding>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<!-- assembly plugin -->
|
||||
<plugin>
|
||||
<artifactId>maven-assembly-plugin</artifactId>
|
||||
<configuration>
|
||||
<descriptors>
|
||||
<descriptor>src/main/assembly/package.xml</descriptor>
|
||||
</descriptors>
|
||||
<finalName>datax</finalName>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>dwzip</id>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>single</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
35
gaussdbreader/src/main/assembly/package.xml
Executable file
35
gaussdbreader/src/main/assembly/package.xml
Executable file
@ -0,0 +1,35 @@
|
||||
<assembly
|
||||
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
|
||||
<id></id>
|
||||
<formats>
|
||||
<format>dir</format>
|
||||
</formats>
|
||||
<includeBaseDirectory>false</includeBaseDirectory>
|
||||
<fileSets>
|
||||
<fileSet>
|
||||
<directory>src/main/resources</directory>
|
||||
<includes>
|
||||
<include>plugin.json</include>
|
||||
<include>plugin_job_template.json</include>
|
||||
</includes>
|
||||
<outputDirectory>plugin/reader/gaussdbreader</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>target/</directory>
|
||||
<includes>
|
||||
<include>gaussdbreader-0.0.1-SNAPSHOT.jar</include>
|
||||
</includes>
|
||||
<outputDirectory>plugin/reader/gaussdbreader</outputDirectory>
|
||||
</fileSet>
|
||||
</fileSets>
|
||||
|
||||
<dependencySets>
|
||||
<dependencySet>
|
||||
<useProjectArtifact>false</useProjectArtifact>
|
||||
<outputDirectory>plugin/reader/gaussdbreader/libs</outputDirectory>
|
||||
<scope>runtime</scope>
|
||||
</dependencySet>
|
||||
</dependencySets>
|
||||
</assembly>
|
@ -0,0 +1,7 @@
|
||||
package com.alibaba.datax.plugin.reader.gaussdbreader;
|
||||
|
||||
/**
 * Constants local to the GaussDB reader plugin.
 */
public class Constant {
|
||||
|
||||
/**
 * Fallback fetch size: {@code GaussDbReader.Job#init()} passes this as the
 * default when it reads the fetchSize setting from the job configuration.
 */
public static final int DEFAULT_FETCH_SIZE = 1000;
|
||||
|
||||
}
|
@ -0,0 +1,86 @@
|
||||
package com.alibaba.datax.plugin.reader.gaussdbreader;
|
||||
|
||||
import com.alibaba.datax.common.exception.DataXException;
|
||||
import com.alibaba.datax.common.plugin.RecordSender;
|
||||
import com.alibaba.datax.common.spi.Reader;
|
||||
import com.alibaba.datax.common.util.Configuration;
|
||||
import com.alibaba.datax.plugin.rdbms.reader.CommonRdbmsReader;
|
||||
import com.alibaba.datax.plugin.rdbms.util.DBUtilErrorCode;
|
||||
import com.alibaba.datax.plugin.rdbms.util.DataBaseType;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * DataX reader plugin for GaussDB.
 *
 * <p>Both nested classes are thin adapters: they create the matching
 * {@code CommonRdbmsReader} helper with {@link DataBaseType#GaussDB} and
 * forward every lifecycle call to it. The only logic owned by this class is
 * resolving and validating the fetchSize setting.
 */
public class GaussDbReader extends Reader {
|
||||
|
||||
// Database type handed to every CommonRdbmsReader helper created below.
private static final DataBaseType DATABASE_TYPE = DataBaseType.GaussDB;
|
||||
|
||||
/**
 * Job-level half of the plugin: validates the configuration and forwards
 * split/post/destroy to {@code CommonRdbmsReader.Job}.
 */
public static class Job extends Reader.Job {
|
||||
|
||||
// Plugin job configuration captured in init() and passed to every delegate call.
private Configuration originalConfig;
|
||||
private CommonRdbmsReader.Job commonRdbmsReaderMaster;
|
||||
|
||||
/**
 * Captures the plugin job configuration, resolves fetchSize and writes it
 * back into the configuration, then creates and initialises the
 * {@code CommonRdbmsReader.Job} delegate.
 *
 * @throws DataXException with {@code DBUtilErrorCode.REQUIRED_VALUE} when
 *         the resolved fetchSize is smaller than 1
 */
@Override
|
||||
public void init() {
|
||||
this.originalConfig = super.getPluginJobConf();
|
||||
// Read the configured fetchSize, with this plugin's DEFAULT_FETCH_SIZE passed as the default.
int fetchSize = this.originalConfig.getInt(com.alibaba.datax.plugin.rdbms.reader.Constant.FETCH_SIZE,
|
||||
Constant.DEFAULT_FETCH_SIZE);
|
||||
// Zero or negative values are rejected as a configuration error.
if (fetchSize < 1) {
|
||||
throw DataXException.asDataXException(DBUtilErrorCode.REQUIRED_VALUE,
|
||||
String.format("您配置的fetchSize有误,根据DataX的设计,fetchSize : [%d] 设置值不能小于 1.", fetchSize));
|
||||
}
|
||||
// Store the resolved value under the same key so later reads see an explicit fetchSize.
this.originalConfig.set(com.alibaba.datax.plugin.rdbms.reader.Constant.FETCH_SIZE, fetchSize);
|
||||
|
||||
this.commonRdbmsReaderMaster = new CommonRdbmsReader.Job(DATABASE_TYPE);
|
||||
this.commonRdbmsReaderMaster.init(this.originalConfig);
|
||||
}
|
||||
|
||||
/**
 * Delegates splitting of the job configuration to the common RDBMS reader.
 *
 * @param adviceNumber value forwarded unchanged to the delegate's split call
 * @return the configurations produced by the delegate
 */
@Override
|
||||
public List<Configuration> split(int adviceNumber) {
|
||||
return this.commonRdbmsReaderMaster.split(this.originalConfig, adviceNumber);
|
||||
}
|
||||
|
||||
/** Forwards the job-level post step to the common RDBMS reader. */
@Override
|
||||
public void post() {
|
||||
this.commonRdbmsReaderMaster.post(this.originalConfig);
|
||||
}
|
||||
|
||||
/** Forwards the job-level destroy step to the common RDBMS reader. */
@Override
|
||||
public void destroy() {
|
||||
this.commonRdbmsReaderMaster.destroy(this.originalConfig);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
 * Task-level half of the plugin: forwards init/startRead/post/destroy to
 * {@code CommonRdbmsReader.Task}.
 */
public static class Task extends Reader.Task {
|
||||
|
||||
// Configuration for this task, captured in init().
private Configuration readerSliceConfig;
|
||||
private CommonRdbmsReader.Task commonRdbmsReaderSlave;
|
||||
|
||||
/**
 * Captures the task configuration and creates the
 * {@code CommonRdbmsReader.Task} delegate for this task group id and task id.
 */
@Override
|
||||
public void init() {
|
||||
this.readerSliceConfig = super.getPluginJobConf();
|
||||
this.commonRdbmsReaderSlave = new CommonRdbmsReader.Task(DATABASE_TYPE,super.getTaskGroupId(), super.getTaskId());
|
||||
this.commonRdbmsReaderSlave.init(this.readerSliceConfig);
|
||||
}
|
||||
|
||||
/**
 * Reads fetchSize from the task configuration and lets the common RDBMS
 * reader perform the read, emitting records through {@code recordSender}.
 *
 * @param recordSender sink handed to the delegate for the records it reads
 */
@Override
|
||||
public void startRead(RecordSender recordSender) {
|
||||
// No default is supplied here. NOTE(review): presumably the task configuration
// inherits the fetchSize that Job.init() stored in the job configuration — confirm.
int fetchSize = this.readerSliceConfig.getInt(com.alibaba.datax.plugin.rdbms.reader.Constant.FETCH_SIZE);
|
||||
|
||||
this.commonRdbmsReaderSlave.startRead(this.readerSliceConfig, recordSender,
|
||||
super.getTaskPluginCollector(), fetchSize);
|
||||
}
|
||||
|
||||
/** Forwards the task-level post step to the common RDBMS reader. */
@Override
|
||||
public void post() {
|
||||
this.commonRdbmsReaderSlave.post(this.readerSliceConfig);
|
||||
}
|
||||
|
||||
/** Forwards the task-level destroy step to the common RDBMS reader. */
@Override
|
||||
public void destroy() {
|
||||
this.commonRdbmsReaderSlave.destroy(this.readerSliceConfig);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
6
gaussdbreader/src/main/resources/plugin.json
Executable file
6
gaussdbreader/src/main/resources/plugin.json
Executable file
@ -0,0 +1,6 @@
|
||||
{
|
||||
"name": "gaussdbreader",
|
||||
"class": "com.alibaba.datax.plugin.reader.gaussdbreader.GaussDbReader",
|
||||
"description": "useScene: prod. mechanism: Jdbc connection using the database, execute select sql, retrieve data from the ResultSet. warn: The more you know about the database, the less problems you encounter.",
|
||||
"developer": "alibaba"
|
||||
}
|
13
gaussdbreader/src/main/resources/plugin_job_template.json
Normal file
13
gaussdbreader/src/main/resources/plugin_job_template.json
Normal file
@ -0,0 +1,13 @@
|
||||
{
|
||||
"name": "gaussdbreader",
|
||||
"parameter": {
|
||||
"username": "",
|
||||
"password": "",
|
||||
"connection": [
|
||||
{
|
||||
"table": [],
|
||||
"jdbcUrl": []
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
267
gaussdbwriter/doc/gaussdbwriter.md
Normal file
267
gaussdbwriter/doc/gaussdbwriter.md
Normal file
@ -0,0 +1,267 @@
|
||||
# DataX GaussDbWriter
|
||||
|
||||
|
||||
---
|
||||
|
||||
|
||||
## 1 快速介绍
|
||||
|
||||
GaussDbWriter插件实现了写入数据到 GaussDB主库目的表的功能。在底层实现上,GaussDbWriter通过JDBC连接远程 GaussDB 数据库,并执行相应的 insert into ... sql 语句将数据写入 GaussDB,内部会分批次提交入库。
|
||||
|
||||
GaussDbWriter面向ETL开发工程师,他们使用GaussDbWriter从数仓导入数据到GaussDB。同时 GaussDbWriter亦可以作为数据迁移工具为DBA等用户提供服务。
|
||||
|
||||
|
||||
## 2 实现原理
|
||||
|
||||
GaussDbWriter通过 DataX 框架获取 Reader 生成的协议数据,根据你配置生成相应的SQL插入语句
|
||||
|
||||
|
||||
* `insert into...`(当主键/唯一性索引冲突时会写不进去冲突的行)
|
||||
|
||||
<br />
|
||||
|
||||
注意:
|
||||
1. 目的表所在数据库必须是主库才能写入数据;整个任务至少需具备 insert into...的权限,是否需要其他权限,取决于你任务配置中在 preSql 和 postSql 中指定的语句。
|
||||
2. GaussDbWriter和MysqlWriter不同,不支持配置writeMode参数。
|
||||
|
||||
|
||||
## 3 功能说明
|
||||
|
||||
### 3.1 配置样例
|
||||
|
||||
* 这里使用一份从内存产生到 GaussDbWriter导入的数据。
|
||||
|
||||
```json
|
||||
{
|
||||
"job": {
|
||||
"setting": {
|
||||
"speed": {
|
||||
"channel": 1
|
||||
}
|
||||
},
|
||||
"content": [
|
||||
{
|
||||
"reader": {
|
||||
"name": "streamreader",
|
||||
"parameter": {
|
||||
"column" : [
|
||||
{
|
||||
"value": "DataX",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"value": 19880808,
|
||||
"type": "long"
|
||||
},
|
||||
{
|
||||
"value": "1988-08-08 08:08:08",
|
||||
"type": "date"
|
||||
},
|
||||
{
|
||||
"value": true,
|
||||
"type": "bool"
|
||||
},
|
||||
{
|
||||
"value": "test",
|
||||
"type": "bytes"
|
||||
}
|
||||
],
|
||||
"sliceRecordCount": 1000
|
||||
}
|
||||
},
|
||||
"writer": {
|
||||
"name": "gaussdbwriter",
|
||||
"parameter": {
|
||||
"username": "xx",
|
||||
"password": "xx",
|
||||
"column": [
|
||||
"id",
|
||||
"name"
|
||||
],
|
||||
"preSql": [
|
||||
"delete from test"
|
||||
],
|
||||
"connection": [
|
||||
{
|
||||
"jdbcUrl": "jdbc:opengauss://127.0.0.1:3002/datax",
|
||||
"table": [
|
||||
"test"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
|
||||
### 3.2 参数说明
|
||||
|
||||
* **jdbcUrl**
|
||||
|
||||
* 描述:目的数据库的 JDBC 连接信息 ,jdbcUrl必须包含在connection配置单元中。
|
||||
|
||||
注意:1、在一个数据库上只能配置一个值。
|
||||
2、jdbcUrl按照GaussDB官方规范,并可以填写连接附加参数信息。具体请参看GaussDB官方文档或者咨询对应 DBA。
|
||||
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **username**
|
||||
|
||||
* 描述:目的数据库的用户名 <br />
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **password**
|
||||
|
||||
* 描述:目的数据库的密码 <br />
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **table**
|
||||
|
||||
* 描述:目的表的表名称。支持写入一个或者多个表。当配置为多张表时,必须确保所有表结构保持一致。
|
||||
|
||||
注意:table 和 jdbcUrl 必须包含在 connection 配置单元中
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **column**
|
||||
|
||||
* 描述:目的表需要写入数据的字段,字段之间用英文逗号分隔。例如: "column": ["id","name","age"]。如果要依次写入全部列,使用\*表示, 例如: "column": ["\*"]
|
||||
|
||||
注意:1、我们强烈不推荐你这样配置,因为当你目的表字段个数、类型等有改动时,你的任务可能运行不正确或者失败
|
||||
2、此处 column 不能配置任何常量值
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **preSql**
|
||||
|
||||
* 描述:写入数据到目的表前,会先执行这里的标准语句。如果 Sql 中有你需要操作到的表名称,请使用 `@table` 表示,这样在实际执行 Sql 语句时,会对变量按照实际表名称进行替换。比如你的任务是要写入到目的端的100个同构分表(表名称为:datax_00,datax_01, ... datax_98,datax_99),并且你希望导入数据前,先对表中数据进行删除操作,那么你可以这样配置:`"preSql":["delete from @table"]`,效果是:在执行到每个表写入数据前,会先执行对应的 delete from 对应表名称 <br />
|
||||
|
||||
* 必选:否 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **postSql**
|
||||
|
||||
* 描述:写入数据到目的表后,会执行这里的标准语句。(原理同 preSql ) <br />
|
||||
|
||||
* 必选:否 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **batchSize**
|
||||
|
||||
* 描述:一次性批量提交的记录数大小,该值可以极大减少DataX与GaussDB的网络交互次数,并提升整体吞吐量。但是该值设置过大可能会造成DataX运行进程OOM情况。<br />
|
||||
|
||||
* 必选:否 <br />
|
||||
|
||||
* 默认值:1024 <br />
|
||||
|
||||
### 3.3 类型转换
|
||||
|
||||
目前 GaussDbWriter支持大部分 GaussDB类型,但也存在部分没有支持的情况,请注意检查你的类型。
|
||||
|
||||
下面列出 GaussDbWriter针对 GaussDB类型转换列表:
|
||||
|
||||
| DataX 内部类型| GaussDB 数据类型 |
|
||||
| -------- | ----- |
|
||||
| Long |bigint, bigserial, integer, smallint, serial |
|
||||
| Double |double precision, money, numeric, real |
|
||||
| String |varchar, char, text, bit|
|
||||
| Date |date, time, timestamp |
|
||||
| Boolean |bool|
|
||||
| Bytes |bytea|
|
||||
|
||||
## 4 性能报告
|
||||
|
||||
### 4.1 环境准备
|
||||
|
||||
#### 4.1.1 数据特征
|
||||
建表语句:
|
||||
|
||||
create table pref_test(
|
||||
id serial,
|
||||
a_bigint bigint,
|
||||
a_bit bit(10),
|
||||
a_boolean boolean,
|
||||
a_char character(5),
|
||||
a_date date,
|
||||
a_double double precision,
|
||||
a_integer integer,
|
||||
a_money money,
|
||||
a_num numeric(10,2),
|
||||
a_real real,
|
||||
a_smallint smallint,
|
||||
a_text text,
|
||||
a_time time,
|
||||
a_timestamp timestamp
|
||||
)
|
||||
|
||||
#### 4.1.2 机器参数
|
||||
|
||||
* 执行DataX的机器参数为:
|
||||
1. cpu: 16核 Intel(R) Xeon(R) CPU E5620 @ 2.40GHz
|
||||
2. mem: MemTotal: 24676836kB MemFree: 6365080kB
|
||||
3. net: 百兆双网卡
|
||||
|
||||
* GaussDB数据库机器参数为:
|
||||
D12 24逻辑核 192G内存 12*480G SSD 阵列
|
||||
|
||||
|
||||
### 4.2 测试报告
|
||||
|
||||
#### 4.2.1 单表测试报告
|
||||
|
||||
| 通道数| 批量提交batchSize | DataX速度(Rec/s)| DataX流量(M/s) | DataX机器运行负载
|
||||
|--------|--------| --------|--------|--------|
|
||||
|1| 128 | 9259 | 0.55 | 0.3
|
||||
|1| 512 | 10869 | 0.653 | 0.3
|
||||
|1| 2048 | 9803 | 0.589 | 0.8
|
||||
|4| 128 | 30303 | 1.82 | 1
|
||||
|4| 512 | 36363 | 2.18 | 1
|
||||
|4| 2048 | 36363 | 2.18 | 1
|
||||
|8| 128 | 57142 | 3.43 | 2
|
||||
|8| 512 | 66666 | 4.01 | 1.5
|
||||
|8| 2048 | 66666 | 4.01 | 1.1
|
||||
|16| 128 | 88888 | 5.34 | 1.8
|
||||
|16| 2048 | 94117 | 5.65 | 2.5
|
||||
|32| 512 | 76190 | 4.58 | 3
|
||||
|
||||
#### 4.2.2 性能测试小结
|
||||
1. `channel数对性能影响很大`
|
||||
2. `通常不建议写入数据库时,通道个数 > 32`
|
||||
|
||||
|
||||
## FAQ
|
||||
|
||||
***
|
||||
|
||||
**Q: GaussDbWriter 执行 postSql 语句报错,那么数据导入到目标数据库了吗?**
|
||||
|
||||
A: DataX 导入过程存在三块逻辑,pre 操作、导入操作、post 操作,其中任意一环报错,DataX 作业报错。由于 DataX 不能保证在同一个事务完成上述几个操作,因此有可能数据已经落入到目标端。
|
||||
|
||||
***
|
||||
|
||||
**Q: 按照上述说法,那么有部分脏数据导入数据库,如果影响到线上数据库怎么办?**
|
||||
|
||||
A: 目前有两种解法,第一种配置 pre 语句,该 sql 可以清理当天导入数据, DataX 每次导入时候可以把上次清理干净并导入完整数据。
|
||||
第二种,向临时表导入数据,完成后再 rename 到线上表。
|
||||
|
||||
***
|
86
gaussdbwriter/pom.xml
Normal file
86
gaussdbwriter/pom.xml
Normal file
@ -0,0 +1,86 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>datax-all</artifactId>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<version>0.0.1-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>gaussdbwriter</artifactId>
|
||||
<name>gaussdbwriter</name>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>datax-common</artifactId>
|
||||
<version>${datax-project-version}</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
<groupId>org.slf4j</groupId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>ch.qos.logback</groupId>
|
||||
<artifactId>logback-classic</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
<artifactId>plugin-rdbms-util</artifactId>
|
||||
<version>${datax-project-version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.opengauss</groupId>
|
||||
<artifactId>opengauss-jdbc</artifactId>
|
||||
<version>3.0.0</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<!-- compiler plugin -->
|
||||
<plugin>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<configuration>
|
||||
<source>${jdk-version}</source>
|
||||
<target>${jdk-version}</target>
|
||||
<encoding>${project-sourceEncoding}</encoding>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<!-- assembly plugin -->
|
||||
<plugin>
|
||||
<artifactId>maven-assembly-plugin</artifactId>
|
||||
<configuration>
|
||||
<descriptors>
|
||||
<descriptor>src/main/assembly/package.xml</descriptor>
|
||||
</descriptors>
|
||||
<finalName>datax</finalName>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>dwzip</id>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>single</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
35
gaussdbwriter/src/main/assembly/package.xml
Executable file
35
gaussdbwriter/src/main/assembly/package.xml
Executable file
@ -0,0 +1,35 @@
|
||||
<assembly
|
||||
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
|
||||
<id></id>
|
||||
<formats>
|
||||
<format>dir</format>
|
||||
</formats>
|
||||
<includeBaseDirectory>false</includeBaseDirectory>
|
||||
<fileSets>
|
||||
<fileSet>
|
||||
<directory>src/main/resources</directory>
|
||||
<includes>
|
||||
<include>plugin.json</include>
|
||||
<include>plugin_job_template.json</include>
|
||||
</includes>
|
||||
<outputDirectory>plugin/writer/gaussdbwriter</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>target/</directory>
|
||||
<includes>
|
||||
<include>gaussdbwriter-0.0.1-SNAPSHOT.jar</include>
|
||||
</includes>
|
||||
<outputDirectory>plugin/writer/gaussdbwriter</outputDirectory>
|
||||
</fileSet>
|
||||
</fileSets>
|
||||
|
||||
<dependencySets>
|
||||
<dependencySet>
|
||||
<useProjectArtifact>false</useProjectArtifact>
|
||||
<outputDirectory>plugin/writer/gaussdbwriter/libs</outputDirectory>
|
||||
<scope>runtime</scope>
|
||||
</dependencySet>
|
||||
</dependencySets>
|
||||
</assembly>
|
@ -0,0 +1,103 @@
|
||||
package com.alibaba.datax.plugin.reader.gaussdbwriter;
|
||||
|
||||
import com.alibaba.datax.common.exception.DataXException;
|
||||
import com.alibaba.datax.common.plugin.RecordReceiver;
|
||||
import com.alibaba.datax.common.spi.Writer;
|
||||
import com.alibaba.datax.common.util.Configuration;
|
||||
import com.alibaba.datax.plugin.rdbms.util.DBUtilErrorCode;
|
||||
import com.alibaba.datax.plugin.rdbms.util.DataBaseType;
|
||||
import com.alibaba.datax.plugin.rdbms.writer.CommonRdbmsWriter;
|
||||
import com.alibaba.datax.plugin.rdbms.writer.Key;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * DataX writer plugin for GaussDB.
 *
 * <p>Both nested classes are thin adapters over {@code CommonRdbmsWriter}
 * configured with {@link DataBaseType#GaussDB}. The logic owned by this class
 * is rejecting any configured writeMode and choosing the SQL placeholder
 * (with an explicit cast) for each column type.
 */
public class GaussDbWriter extends Writer {
|
||||
|
||||
// Database type handed to every CommonRdbmsWriter helper created below.
private static final DataBaseType DATABASE_TYPE = DataBaseType.GaussDB;
|
||||
|
||||
/**
 * Job-level half of the plugin: validates the configuration and forwards
 * prepare/split/post/destroy to {@code CommonRdbmsWriter.Job}.
 */
public static class Job extends Writer.Job {
|
||||
// Plugin job configuration captured in init() and passed to every delegate call.
private Configuration originalConfig = null;
|
||||
private CommonRdbmsWriter.Job commonRdbmsWriterMaster;
|
||||
|
||||
/**
 * Captures the plugin job configuration, rejects any configured writeMode,
 * then creates and initialises the {@code CommonRdbmsWriter.Job} delegate.
 *
 * @throws DataXException with {@code DBUtilErrorCode.CONF_ERROR} when a
 *         writeMode value is present in the configuration
 */
@Override
|
||||
public void init() {
|
||||
this.originalConfig = super.getPluginJobConf();
|
||||
|
||||
// Unlike the MySQL writer, GaussDB supports insert mode only, so writeMode must not be configured.
|
||||
String writeMode = this.originalConfig.getString(Key.WRITE_MODE);
|
||||
// Any non-null writeMode is treated as a configuration error.
if (null != writeMode) {
|
||||
throw DataXException.asDataXException(DBUtilErrorCode.CONF_ERROR,
|
||||
String.format("写入模式(writeMode)配置有误. 因为GaussDB不支持配置参数项 writeMode: %s, GaussDB仅使用insert sql 插入数据. 请检查您的配置并作出修改.", writeMode));
|
||||
}
|
||||
|
||||
this.commonRdbmsWriterMaster = new CommonRdbmsWriter.Job(DATABASE_TYPE);
|
||||
this.commonRdbmsWriterMaster.init(this.originalConfig);
|
||||
}
|
||||
|
||||
/** Forwards the job-level prepare step to the common RDBMS writer. */
@Override
|
||||
public void prepare() {
|
||||
this.commonRdbmsWriterMaster.prepare(this.originalConfig);
|
||||
}
|
||||
|
||||
/**
 * Delegates splitting of the job configuration to the common RDBMS writer.
 *
 * @param mandatoryNumber value forwarded unchanged to the delegate's split call
 * @return the configurations produced by the delegate
 */
@Override
|
||||
public List<Configuration> split(int mandatoryNumber) {
|
||||
return this.commonRdbmsWriterMaster.split(this.originalConfig, mandatoryNumber);
|
||||
}
|
||||
|
||||
/** Forwards the job-level post step to the common RDBMS writer. */
@Override
|
||||
public void post() {
|
||||
this.commonRdbmsWriterMaster.post(this.originalConfig);
|
||||
}
|
||||
|
||||
/** Forwards the job-level destroy step to the common RDBMS writer. */
@Override
|
||||
public void destroy() {
|
||||
this.commonRdbmsWriterMaster.destroy(this.originalConfig);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
 * Task-level half of the plugin: forwards init/prepare/startWrite/post/destroy
 * to a {@code CommonRdbmsWriter.Task} whose placeholder generation is
 * customised in {@link #init()}.
 */
public static class Task extends Writer.Task {
|
||||
// Configuration for this task, captured in init().
private Configuration writerSliceConfig;
|
||||
private CommonRdbmsWriter.Task commonRdbmsWriterSlave;
|
||||
|
||||
/**
 * Captures the task configuration and creates the writer delegate as an
 * anonymous {@code CommonRdbmsWriter.Task} subclass that overrides
 * {@code calcValueHolder}.
 */
@Override
|
||||
public void init() {
|
||||
this.writerSliceConfig = super.getPluginJobConf();
|
||||
this.commonRdbmsWriterSlave = new CommonRdbmsWriter.Task(DATABASE_TYPE){
|
||||
/**
 * Returns the SQL placeholder for a column, suffixed with an explicit cast:
 * serial -> int, bigserial -> int8, bit -> bit varying (all matched
 * case-insensitively); any other type is cast to the type name as given.
 */
@Override
|
||||
public String calcValueHolder(String columnType){
|
||||
if("serial".equalsIgnoreCase(columnType)){
|
||||
return "?::int";
|
||||
}else if("bigserial".equalsIgnoreCase(columnType)){
|
||||
return "?::int8";
|
||||
}else if("bit".equalsIgnoreCase(columnType)){
|
||||
return "?::bit varying";
|
||||
}
|
||||
// Fallback: cast the placeholder to the reported column type verbatim.
return "?::" + columnType;
|
||||
}
|
||||
};
|
||||
this.commonRdbmsWriterSlave.init(this.writerSliceConfig);
|
||||
}
|
||||
|
||||
/** Forwards the task-level prepare step to the common RDBMS writer. */
@Override
|
||||
public void prepare() {
|
||||
this.commonRdbmsWriterSlave.prepare(this.writerSliceConfig);
|
||||
}
|
||||
|
||||
/**
 * Lets the common RDBMS writer consume records from {@code recordReceiver}
 * using this task's configuration and plugin collector.
 *
 * @param recordReceiver source handed to the delegate for the records to write
 */
// NOTE(review): unlike the sibling lifecycle methods this one carries no @Override;
// presumably it implements Writer.Task#startWrite — confirm and add the annotation.
public void startWrite(RecordReceiver recordReceiver) {
|
||||
this.commonRdbmsWriterSlave.startWrite(recordReceiver, this.writerSliceConfig, super.getTaskPluginCollector());
|
||||
}
|
||||
|
||||
/** Forwards the task-level post step to the common RDBMS writer. */
@Override
|
||||
public void post() {
|
||||
this.commonRdbmsWriterSlave.post(this.writerSliceConfig);
|
||||
}
|
||||
|
||||
/** Forwards the task-level destroy step to the common RDBMS writer. */
@Override
|
||||
public void destroy() {
|
||||
this.commonRdbmsWriterSlave.destroy(this.writerSliceConfig);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
6
gaussdbwriter/src/main/resources/plugin.json
Executable file
6
gaussdbwriter/src/main/resources/plugin.json
Executable file
@ -0,0 +1,6 @@
|
||||
{
|
||||
"name": "gaussdbwriter",
|
||||
"class": "com.alibaba.datax.plugin.reader.gaussdbwriter.GaussDbWriter",
|
||||
"description": "useScene: prod. mechanism: Jdbc connection using the database, execute insert sql. warn: The more you know about the database, the less problems you encounter.",
|
||||
"developer": "alibaba"
|
||||
}
|
16
gaussdbwriter/src/main/resources/plugin_job_template.json
Normal file
16
gaussdbwriter/src/main/resources/plugin_job_template.json
Normal file
@ -0,0 +1,16 @@
|
||||
{
|
||||
"name": "gaussdbwriter",
|
||||
"parameter": {
|
||||
"username": "",
|
||||
"password": "",
|
||||
"column": [],
|
||||
"connection": [
|
||||
{
|
||||
"jdbcUrl": "",
|
||||
"table": []
|
||||
}
|
||||
],
|
||||
"preSql": [],
|
||||
"postSql": []
|
||||
}
|
||||
}
|
@ -60,12 +60,16 @@ hbase11xsqlreader插件实现了从Phoenix(HBase SQL)读取数据。在底层实
|
||||
//填写连接Phoenix的hbase集群zk地址
|
||||
"hbaseConfig": {
|
||||
"hbase.zookeeper.quorum": "hb-proxy-xxx-002.hbase.rds.aliyuncs.com,hb-proxy-xxx-001.hbase.rds.aliyuncs.com,hb-proxy-xxx-003.hbase.rds.aliyuncs.com"
|
||||
},
|
||||
},
|
||||
//填写要读取的phoenix的命名空间
|
||||
"schema": "TAG",
|
||||
//填写要读取的phoenix的表名
|
||||
"table": "US_POPULATION",
|
||||
//填写要读取的列名,不填读取所有列
|
||||
"column": [
|
||||
]
|
||||
],
|
||||
//查询条件
|
||||
"where": "id="
|
||||
}
|
||||
},
|
||||
"writer": {
|
||||
@ -92,11 +96,18 @@ hbase11xsqlreader插件实现了从Phoenix(HBase SQL)读取数据。在底层实
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
* **schema**
|
||||
|
||||
* 描述:填写Phoenix表所属的namespace(schema)名称,例如上文示例中的 'TAG'
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **table**
|
||||
|
||||
* 描述:编写Phoenix中的表名,如果有namespace,该值设置为'namespace.tablename'
|
||||
* 描述:编写Phoenix中的表名,该值设置为'tablename'
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
@ -109,7 +120,13 @@ hbase11xsqlreader插件实现了从Phoenix(HBase SQL)读取数据。在底层实
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
* **where**
|
||||
|
||||
* 描述:填写从Phoenix表读取数据时使用的过滤条件,例如 "id > 100";不填则不附加过滤条件。
|
||||
|
||||
* 必选:否 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
### 3.3 类型转换
|
||||
|
||||
@ -172,11 +189,14 @@ hbase11xsqlreader插件实现了从Phoenix(HBase SQL)读取数据。在底层实
|
||||
"hbaseConfig": {
|
||||
"hbase.zookeeper.quorum": "hb-proxy-xxx-002.hbase.rds.aliyuncs.com,hb-proxy-xxx-001.hbase.rds.aliyuncs.com,hb-proxy-xxx-003.hbase.rds.aliyuncs.com"
|
||||
},
|
||||
"schema": "TAG",
|
||||
//填写要读取的phoenix的表名
|
||||
"table": "US_POPULATION",
|
||||
//填写要读取的列名,不填读取所有列
|
||||
"column": [
|
||||
]
|
||||
],
|
||||
//查询条件
|
||||
"where": "id="
|
||||
}
|
||||
},
|
||||
"writer": {
|
||||
@ -204,7 +224,13 @@ hbase11xsqlreader插件实现了从Phoenix(HBase SQL)读取数据。在底层实
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
* **schema**
|
||||
|
||||
* 描述:填写Phoenix表所属的namespace(schema)名称,例如上文示例中的 'TAG'
|
||||
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
* **table**
|
||||
|
||||
* 描述:编写Phoenix中的表名,如果有namespace,该值设置为'namespace.tablename'
|
||||
@ -220,7 +246,13 @@ hbase11xsqlreader插件实现了从Phoenix(HBase SQL)读取数据。在底层实
|
||||
* 必选:是 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
* **where**
|
||||
|
||||
* 描述:填写从Phoenix表读取数据时使用的过滤条件,例如 "id > 100";不填则不附加过滤条件。
|
||||
|
||||
* 必选:否 <br />
|
||||
|
||||
* 默认值:无 <br />
|
||||
|
||||
### 3.3 类型转换
|
||||
|
||||
|
@ -26,9 +26,7 @@ import java.io.IOException;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class HbaseSQLHelper {
|
||||
@ -50,11 +48,15 @@ public class HbaseSQLHelper {
|
||||
String zkUrl = readerConfig.getZkUrl();
|
||||
|
||||
PhoenixConfigurationUtil.setInputClass(conf, PhoenixRecordWritable.class);
|
||||
PhoenixConfigurationUtil.setInputTableName(conf, table);
|
||||
|
||||
PhoenixConfigurationUtil.setInputTableName(conf, readerConfig.getSchema()+"."+table);
|
||||
|
||||
if (!columns.isEmpty()) {
|
||||
PhoenixConfigurationUtil.setSelectColumnNames(conf, columns.toArray(new String[columns.size()]));
|
||||
}
|
||||
if(Objects.nonNull(readerConfig.getWhere())){
|
||||
PhoenixConfigurationUtil.setInputTableConditions(conf,readerConfig.getWhere());
|
||||
}
|
||||
PhoenixEmbeddedDriver.ConnectionInfo info = null;
|
||||
try {
|
||||
info = PhoenixEmbeddedDriver.ConnectionInfo.create(zkUrl);
|
||||
@ -67,15 +69,19 @@ public class HbaseSQLHelper {
|
||||
conf.setInt(HConstants.ZOOKEEPER_CLIENT_PORT, info.getPort());
|
||||
if (info.getRootNode() != null)
|
||||
conf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, info.getRootNode());
|
||||
conf.set(Key.NAME_SPACE_MAPPING_ENABLED,"true");
|
||||
conf.set(Key.SYSTEM_TABLES_TO_NAMESPACE,"true");
|
||||
return conf;
|
||||
}
|
||||
|
||||
public static List<String> getPColumnNames(String connectionString, String tableName) throws SQLException {
|
||||
Connection con =
|
||||
DriverManager.getConnection(connectionString);
|
||||
public static List<String> getPColumnNames(String connectionString, String tableName,String schema) throws SQLException {
|
||||
Properties pro = new Properties();
|
||||
pro.put(Key.NAME_SPACE_MAPPING_ENABLED, true);
|
||||
pro.put(Key.SYSTEM_TABLES_TO_NAMESPACE, true);
|
||||
Connection con = DriverManager.getConnection(connectionString,pro);
|
||||
PhoenixConnection phoenixConnection = con.unwrap(PhoenixConnection.class);
|
||||
MetaDataClient metaDataClient = new MetaDataClient(phoenixConnection);
|
||||
PTable table = metaDataClient.updateCache("", tableName).getTable();
|
||||
PTable table = metaDataClient.updateCache(schema, tableName).getTable();
|
||||
List<String> columnNames = new ArrayList<String>();
|
||||
for (PColumn pColumn : table.getColumns()) {
|
||||
if (!pColumn.getName().getString().equals(SaltingUtil.SALTING_COLUMN_NAME))
|
||||
|
@ -9,6 +9,7 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.List;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public class HbaseSQLReaderConfig {
|
||||
private final static Logger LOG = LoggerFactory.getLogger(HbaseSQLReaderConfig.class);
|
||||
@ -27,6 +28,9 @@ public class HbaseSQLReaderConfig {
|
||||
private String tableName;
|
||||
private List<String> columns; // 目的表的所有列的列名,包括主键和非主键,不包括时间列
|
||||
|
||||
private String where;//条件
|
||||
|
||||
private String schema;//
|
||||
/**
|
||||
* @return 获取原始的datax配置
|
||||
*/
|
||||
@ -96,22 +100,27 @@ public class HbaseSQLReaderConfig {
|
||||
}
|
||||
String zkQuorum = zkCfg.getFirst();
|
||||
String znode = zkCfg.getSecond();
|
||||
|
||||
if (zkQuorum == null || zkQuorum.isEmpty()) {
|
||||
throw DataXException.asDataXException(
|
||||
HbaseSQLReaderErrorCode.ILLEGAL_VALUE, "HBase的hbase.zookeeper.quorum配置不能为空" );
|
||||
}
|
||||
// 生成sql使用的连接字符串, 格式: jdbc:hbase:zk_quorum:2181:/znode_parent
|
||||
cfg.connectionString = "jdbc:phoenix:" + zkQuorum;
|
||||
cfg.zkUrl = zkQuorum + ":2181";
|
||||
StringBuilder connectionString=new StringBuilder("jdbc:phoenix:");
|
||||
connectionString.append(zkQuorum);
|
||||
cfg.connectionString = connectionString.toString();
|
||||
StringBuilder zkUrl =new StringBuilder(zkQuorum);
|
||||
cfg.zkUrl = zkUrl.append(":2181").toString();
|
||||
if (!znode.isEmpty()) {
|
||||
cfg.connectionString += cfg.connectionString + ":" + znode;
|
||||
cfg.zkUrl += cfg.zkUrl + ":" + znode;
|
||||
cfg.connectionString = connectionString.append(":").append(znode).toString();
|
||||
cfg.zkUrl=zkUrl.append(":").append(znode).toString();
|
||||
}
|
||||
}
|
||||
|
||||
private static void parseTableConfig(HbaseSQLReaderConfig cfg, Configuration dataxCfg) {
|
||||
// 解析并检查表名
|
||||
cfg.tableName = dataxCfg.getString(Key.TABLE);
|
||||
cfg.schema = dataxCfg.getString(Key.SCHEMA);
|
||||
if (cfg.tableName == null || cfg.tableName.isEmpty()) {
|
||||
throw DataXException.asDataXException(
|
||||
HbaseSQLReaderErrorCode.ILLEGAL_VALUE, "HBase的tableName配置不能为空,请检查并修改配置." );
|
||||
@ -124,13 +133,14 @@ public class HbaseSQLReaderConfig {
|
||||
HbaseSQLReaderErrorCode.ILLEGAL_VALUE, "您配置的tableName含有非法字符{0},请检查您的配置.");
|
||||
} else if (cfg.columns.isEmpty()) {
|
||||
try {
|
||||
cfg.columns = HbaseSQLHelper.getPColumnNames(cfg.connectionString, cfg.tableName);
|
||||
cfg.columns = HbaseSQLHelper.getPColumnNames(cfg.connectionString, cfg.tableName,cfg.schema);
|
||||
dataxCfg.set(Key.COLUMN, cfg.columns);
|
||||
} catch (SQLException e) {
|
||||
throw DataXException.asDataXException(
|
||||
HbaseSQLReaderErrorCode.GET_PHOENIX_COLUMN_ERROR, "HBase的columns配置不能为空,请添加目标表的列名配置." + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
cfg.where=dataxCfg.getString(Key.WHERE);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -151,6 +161,8 @@ public class HbaseSQLReaderConfig {
|
||||
ret.append(",");
|
||||
}
|
||||
ret.setLength(ret.length() - 1);
|
||||
ret.append("[where=]").append(getWhere());
|
||||
ret.append("[schema=]").append(getSchema());
|
||||
ret.append("\n");
|
||||
|
||||
return ret.toString();
|
||||
@ -161,4 +173,20 @@ public class HbaseSQLReaderConfig {
|
||||
*/
|
||||
private HbaseSQLReaderConfig() {
|
||||
}
|
||||
|
||||
public String getWhere() {
|
||||
return where;
|
||||
}
|
||||
|
||||
public void setWhere(String where) {
|
||||
this.where = where;
|
||||
}
|
||||
|
||||
public String getSchema() {
|
||||
return schema;
|
||||
}
|
||||
|
||||
public void setSchema(String schema) {
|
||||
this.schema = schema;
|
||||
}
|
||||
}
|
||||
|
@ -19,10 +19,8 @@ import org.slf4j.LoggerFactory;
|
||||
import java.io.IOException;
|
||||
import java.math.BigDecimal;
|
||||
import java.sql.*;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.sql.Date;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by admin on 1/3/18.
|
||||
@ -42,11 +40,14 @@ public class HbaseSQLReaderTask {
|
||||
}
|
||||
|
||||
private void getPColumns() throws SQLException {
|
||||
Properties pro = new Properties();
|
||||
pro.put(Key.NAME_SPACE_MAPPING_ENABLED, true);
|
||||
pro.put(Key.SYSTEM_TABLES_TO_NAMESPACE, true);
|
||||
Connection con =
|
||||
DriverManager.getConnection(this.readerConfig.getConnectionString());
|
||||
DriverManager.getConnection(this.readerConfig.getConnectionString(),pro);
|
||||
PhoenixConnection phoenixConnection = con.unwrap(PhoenixConnection.class);
|
||||
MetaDataClient metaDataClient = new MetaDataClient(phoenixConnection);
|
||||
PTable table = metaDataClient.updateCache("", this.readerConfig.getTableName()).getTable();
|
||||
PTable table = metaDataClient.updateCache(this.readerConfig.getSchema(), this.readerConfig.getTableName()).getTable();
|
||||
List<String> columnNames = this.readerConfig.getColumns();
|
||||
for (PColumn pColumn : table.getColumns()) {
|
||||
if (columnNames.contains(pColumn.getName().getString())) {
|
||||
|
@ -24,5 +24,18 @@ public final class Key {
|
||||
* 【必选】列配置
|
||||
*/
|
||||
public final static String COLUMN = "column";
|
||||
/**
|
||||
* [Optional] Filter condition applied when reading from the Phoenix table.
|
||||
*/
|
||||
public static final String WHERE = "where";
|
||||
|
||||
/**
|
||||
* 【可选】Phoenix表所属schema,默认为空
|
||||
*/
|
||||
public static final String SCHEMA = "schema";
|
||||
|
||||
public static final String NAME_SPACE_MAPPING_ENABLED = "phoenix.schema.isNamespaceMappingEnabled";
|
||||
|
||||
public static final String SYSTEM_TABLES_TO_NAMESPACE = "phoenix.schema.mapSystemTablesToNamespace";
|
||||
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>datax-all</artifactId>
|
||||
<groupId>com.alibaba.datax</groupId>
|
||||
@ -111,6 +112,42 @@
|
||||
<version>${datax-project-version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.parquet</groupId>
|
||||
<artifactId>parquet-column</artifactId>
|
||||
<version>1.12.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.parquet</groupId>
|
||||
<artifactId>parquet-avro</artifactId>
|
||||
<version>1.12.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.parquet</groupId>
|
||||
<artifactId>parquet-common</artifactId>
|
||||
<version>1.12.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.parquet</groupId>
|
||||
<artifactId>parquet-format</artifactId>
|
||||
<version>2.3.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.parquet</groupId>
|
||||
<artifactId>parquet-jackson</artifactId>
|
||||
<version>1.12.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.parquet</groupId>
|
||||
<artifactId>parquet-encoding</artifactId>
|
||||
<version>1.12.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.parquet</groupId>
|
||||
<artifactId>parquet-hadoop</artifactId>
|
||||
<version>1.12.0</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
@ -37,6 +37,28 @@
|
||||
<!--</includes>-->
|
||||
<!--<outputDirectory>plugin/reader/hdfsreader/libs</outputDirectory>-->
|
||||
<!--</fileSet>-->
|
||||
<!--<fileSet>-->
|
||||
<!--<directory>src/main/libs</directory>-->
|
||||
<!--<includes>-->
|
||||
<!--<include>*.*</include>-->
|
||||
<!--</includes>-->
|
||||
<!--<outputDirectory>plugin/reader/hdfsreader/libs</outputDirectory>-->
|
||||
<!--</fileSet>-->
|
||||
|
||||
<fileSet>
|
||||
<directory>src/main/libs</directory>
|
||||
<includes>
|
||||
<include>*.*</include>
|
||||
</includes>
|
||||
<outputDirectory>plugin/reader/ossreader/libs</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>src/main/libs</directory>
|
||||
<includes>
|
||||
<include>*.*</include>
|
||||
</includes>
|
||||
<outputDirectory>plugin/reader/hivereader/libs</outputDirectory>
|
||||
</fileSet>
|
||||
</fileSets>
|
||||
|
||||
<dependencySets>
|
||||
|
@ -10,4 +10,5 @@ public class Constant {
|
||||
public static final String CSV = "CSV";
|
||||
public static final String SEQ = "SEQ";
|
||||
public static final String RC = "RC";
|
||||
public static final String PARQUET = "PARQUET";
|
||||
}
|
||||
|
@ -9,12 +9,16 @@ import com.alibaba.datax.plugin.unstructuredstorage.reader.ColumnEntry;
|
||||
import com.alibaba.datax.plugin.unstructuredstorage.reader.UnstructuredStorageReaderErrorCode;
|
||||
import com.alibaba.datax.plugin.unstructuredstorage.reader.UnstructuredStorageReaderUtil;
|
||||
import com.alibaba.fastjson2.JSON;
|
||||
import com.alibaba.fastjson2.JSONArray;
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import org.apache.commons.lang3.BooleanUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.exception.ExceptionUtils;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hive.common.type.HiveDecimal;
|
||||
import org.apache.hadoop.hive.ql.io.RCFile;
|
||||
import org.apache.hadoop.hive.ql.io.RCFileRecordReader;
|
||||
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
|
||||
@ -29,14 +33,30 @@ import org.apache.hadoop.io.*;
|
||||
import org.apache.hadoop.mapred.*;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hadoop.util.ReflectionUtils;
|
||||
import org.apache.parquet.example.data.Group;
|
||||
import org.apache.parquet.hadoop.ParquetReader;
|
||||
import org.apache.parquet.hadoop.example.GroupReadSupport;
|
||||
import org.apache.parquet.hadoop.util.HadoopInputFile;
|
||||
import org.apache.parquet.io.api.Binary;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.apache.parquet.schema.MessageTypeParser;
|
||||
import org.apache.parquet.schema.PrimitiveType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.sql.Timestamp;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.time.LocalDate;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.LocalTime;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Created by mingya.wmy on 2015/8/12.
|
||||
@ -56,6 +76,10 @@ public class DFSUtil {
|
||||
public static final String HDFS_DEFAULTFS_KEY = "fs.defaultFS";
|
||||
public static final String HADOOP_SECURITY_AUTHENTICATION_KEY = "hadoop.security.authentication";
|
||||
|
||||
private Boolean skipEmptyOrcFile = false;
|
||||
|
||||
private Integer orcFileEmptySize = null;
|
||||
|
||||
|
||||
public DFSUtil(Configuration taskConfig) {
|
||||
hadoopConf = new org.apache.hadoop.conf.Configuration();
|
||||
@ -79,6 +103,7 @@ public class DFSUtil {
|
||||
this.hadoopConf.set(HADOOP_SECURITY_AUTHENTICATION_KEY, "kerberos");
|
||||
}
|
||||
this.kerberosAuthentication(this.kerberosPrincipal, this.kerberosKeytabFilePath);
|
||||
this.skipEmptyOrcFile = taskConfig.getBool(Key.SKIP_EMPTY_ORCFILE, false);
|
||||
|
||||
LOG.info(String.format("hadoopConfig details:%s", JSON.toJSONString(this.hadoopConf)));
|
||||
}
|
||||
@ -102,10 +127,11 @@ public class DFSUtil {
|
||||
* @param srcPaths 路径列表
|
||||
* @param specifiedFileType 指定文件类型
|
||||
*/
|
||||
public HashSet<String> getAllFiles(List<String> srcPaths, String specifiedFileType) {
|
||||
public HashSet<String> getAllFiles(List<String> srcPaths, String specifiedFileType, Boolean skipEmptyOrcFile, Integer orcFileEmptySize) {
|
||||
|
||||
this.specifiedFileType = specifiedFileType;
|
||||
|
||||
this.skipEmptyOrcFile = skipEmptyOrcFile;
|
||||
this.orcFileEmptySize = orcFileEmptySize;
|
||||
if (!srcPaths.isEmpty()) {
|
||||
for (String eachPath : srcPaths) {
|
||||
LOG.info(String.format("get HDFS all files in path = [%s]", eachPath));
|
||||
@ -127,9 +153,13 @@ public class DFSUtil {
|
||||
FileStatus stats[] = hdfs.globStatus(path);
|
||||
for (FileStatus f : stats) {
|
||||
if (f.isFile()) {
|
||||
if (f.getLen() == 0) {
|
||||
long fileLength = f.getLen();
|
||||
if (fileLength == 0) {
|
||||
String message = String.format("文件[%s]长度为0,将会跳过不作处理!", hdfsPath);
|
||||
LOG.warn(message);
|
||||
} else if (BooleanUtils.isTrue(this.skipEmptyOrcFile) && this.orcFileEmptySize != null && fileLength <= this.orcFileEmptySize) {
|
||||
String message = String.format("The orc file [%s] is empty, file size: %s, DataX will skip it !", f.getPath().toString(), fileLength);
|
||||
LOG.warn(message);
|
||||
} else {
|
||||
addSourceFileByType(f.getPath().toString());
|
||||
}
|
||||
@ -167,7 +197,16 @@ public class DFSUtil {
|
||||
LOG.info(String.format("[%s] 是目录, 递归获取该目录下的文件", f.getPath().toString()));
|
||||
getHDFSAllFilesNORegex(f.getPath().toString(), hdfs);
|
||||
} else if (f.isFile()) {
|
||||
|
||||
long fileLength = f.getLen();
|
||||
if (fileLength == 0) {
|
||||
String message = String.format("The file [%s] is empty, DataX will skip it !", f.getPath().toString());
|
||||
LOG.warn(message);
|
||||
continue;
|
||||
} else if (BooleanUtils.isTrue(this.skipEmptyOrcFile) && this.orcFileEmptySize != null && fileLength <= this.orcFileEmptySize) {
|
||||
String message = String.format("The orc file [%s] is empty, file size: %s, DataX will skip it !", f.getPath().toString(), fileLength);
|
||||
LOG.warn(message);
|
||||
continue;
|
||||
}
|
||||
addSourceFileByType(f.getPath().toString());
|
||||
} else {
|
||||
String message = String.format("该路径[%s]文件类型既不是目录也不是文件,插件自动忽略。",
|
||||
@ -332,7 +371,19 @@ public class DFSUtil {
|
||||
//Each file as a split
|
||||
//TODO multy threads
|
||||
// OrcInputFormat getSplits params numSplits not used, splits size = block numbers
|
||||
InputSplit[] splits = in.getSplits(conf, -1);
|
||||
InputSplit[] splits;
|
||||
try {
|
||||
splits = in.getSplits(conf, 1);
|
||||
} catch (Exception splitException) {
|
||||
if (Boolean.TRUE.equals(this.skipEmptyOrcFile)) {
|
||||
boolean isOrcFileEmptyException = checkIsOrcEmptyFileExecption(splitException);
|
||||
if (isOrcFileEmptyException) {
|
||||
LOG.info("skipEmptyOrcFile: true, \"{}\" is an empty orc file, skip it!", sourceOrcFilePath);
|
||||
return;
|
||||
}
|
||||
}
|
||||
throw splitException;
|
||||
}
|
||||
for (InputSplit split : splits) {
|
||||
{
|
||||
RecordReader reader = in.getRecordReader(split, conf, Reporter.NULL);
|
||||
@ -349,8 +400,11 @@ public class DFSUtil {
|
||||
Object field = inspector.getStructFieldData(value, fields.get(i));
|
||||
recordFields.add(field);
|
||||
}
|
||||
List<ColumnEntry> hivePartitionColumnEntrys = UnstructuredStorageReaderUtil.getListColumnEntry(readerSliceConfig, com.alibaba.datax.plugin.unstructuredstorage.reader.Key.HIVE_PARTION_COLUMN);
|
||||
ArrayList<Column> hivePartitionColumns = new ArrayList<>();
|
||||
hivePartitionColumns = UnstructuredStorageReaderUtil.getHivePartitionColumns(sourceOrcFilePath, hivePartitionColumnEntrys);
|
||||
transportOneRecord(column, recordFields, recordSender,
|
||||
taskPluginCollector, isReadAllColumns, nullFormat);
|
||||
taskPluginCollector, isReadAllColumns, nullFormat,hivePartitionColumns);
|
||||
}
|
||||
reader.close();
|
||||
}
|
||||
@ -367,8 +421,20 @@ public class DFSUtil {
|
||||
}
|
||||
}
|
||||
|
||||
private boolean checkIsOrcEmptyFileExecption(Exception e) {
|
||||
if (e == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
String fullStackTrace = ExceptionUtils.getStackTrace(e);
|
||||
if (fullStackTrace.contains("org.apache.orc.impl.ReaderImpl.getRawDataSizeOfColumn") && fullStackTrace.contains("Caused by: java.lang.IndexOutOfBoundsException: Index: 1, Size: 1")) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private Record transportOneRecord(List<ColumnEntry> columnConfigs, List<Object> recordFields
|
||||
, RecordSender recordSender, TaskPluginCollector taskPluginCollector, boolean isReadAllColumns, String nullFormat) {
|
||||
, RecordSender recordSender, TaskPluginCollector taskPluginCollector, boolean isReadAllColumns, String nullFormat, ArrayList<Column> hiveParitionColumns) {
|
||||
Record record = recordSender.createRecord();
|
||||
Column columnGenerated;
|
||||
try {
|
||||
@ -555,8 +621,9 @@ public class DFSUtil {
|
||||
} else if (StringUtils.equalsIgnoreCase(specifiedFileType, Constant.SEQ)) {
|
||||
|
||||
return isSequenceFile(filepath, in);
|
||||
} else if (StringUtils.equalsIgnoreCase(specifiedFileType, Constant.PARQUET)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
String message = String.format("检查文件[%s]类型失败,目前支持ORC,SEQUENCE,RCFile,TEXT,CSV五种格式的文件," +
|
||||
"请检查您文件类型和文件是否正确。", filepath);
|
||||
@ -693,4 +760,332 @@ public class DFSUtil {
|
||||
return false;
|
||||
}
|
||||
|
||||
public void parquetFileStartRead(String sourceParquetFilePath, Configuration readerSliceConfig, RecordSender recordSender, TaskPluginCollector taskPluginCollector) {
|
||||
String schemaString = readerSliceConfig.getString(Key.PARQUET_SCHEMA);
|
||||
if (StringUtils.isNotBlank(schemaString)) {
|
||||
LOG.info("You config parquet schema, use it {}", schemaString);
|
||||
} else {
|
||||
schemaString = getParquetSchema(sourceParquetFilePath, hadoopConf);
|
||||
LOG.info("Parquet schema parsed from: {} , schema is {}", sourceParquetFilePath, schemaString);
|
||||
if (StringUtils.isBlank(schemaString)) {
|
||||
throw DataXException.asDataXException("ParquetSchema is required, please check your config");
|
||||
}
|
||||
}
|
||||
MessageType parquetSchema = null;
|
||||
List<org.apache.parquet.schema.Type> parquetTypes = null;
|
||||
Map<String, ParquetMeta> parquetMetaMap = null;
|
||||
int fieldCount = 0;
|
||||
try {
|
||||
parquetSchema = MessageTypeParser.parseMessageType(schemaString);
|
||||
fieldCount = parquetSchema.getFieldCount();
|
||||
parquetTypes = parquetSchema.getFields();
|
||||
parquetMetaMap = ParquetMessageHelper.parseParquetTypes(parquetTypes);
|
||||
} catch (Exception e) {
|
||||
String message = String.format("Error parsing to MessageType via Schema string [%s]", schemaString);
|
||||
LOG.error(message);
|
||||
throw DataXException.asDataXException(HdfsReaderErrorCode.PARSE_MESSAGE_TYPE_FROM_SCHEMA_ERROR, e);
|
||||
}
|
||||
List<ColumnEntry> column = UnstructuredStorageReaderUtil.getListColumnEntry(readerSliceConfig, com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN);
|
||||
String nullFormat = readerSliceConfig.getString(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.NULL_FORMAT);
|
||||
boolean isUtcTimestamp = readerSliceConfig.getBool(Key.PARQUET_UTC_TIMESTAMP, false);
|
||||
boolean isReadAllColumns = (column == null || column.size() == 0) ? true : false;
|
||||
LOG.info("ReadingAllColums: " + isReadAllColumns);
|
||||
|
||||
/**
|
||||
* 支持 hive 表中间加列场景
|
||||
*
|
||||
* 开关默认 false,在 hive表存在中间加列的场景打开,需要根据 name排序
|
||||
* 不默认打开的原因
|
||||
* 1、存量hdfs任务,只根据 index获取字段,无name字段配置
|
||||
* 2、中间加列场景比较少
|
||||
* 3、存量任务可能存在列错位的问题,不能随意纠正
|
||||
*/
|
||||
boolean supportAddMiddleColumn = readerSliceConfig.getBool(Key.SUPPORT_ADD_MIDDLE_COLUMN, false);
|
||||
|
||||
boolean printNullValueException = readerSliceConfig.getBool("printNullValueException", false);
|
||||
List<Integer> ignoreIndex = readerSliceConfig.getList("ignoreIndex", new ArrayList<Integer>(), Integer.class);
|
||||
|
||||
JobConf conf = new JobConf(hadoopConf);
|
||||
ParquetReader<Group> reader = null;
|
||||
try {
|
||||
Path parquetFilePath = new Path(sourceParquetFilePath);
|
||||
GroupReadSupport readSupport = new GroupReadSupport();
|
||||
readSupport.init(conf, null, parquetSchema);
|
||||
// 这里初始化parquetReader的时候,会getFileSystem,如果是HA集群,期间会根据hadoopConfig中区加载failover类,这里初始化builder带上conf
|
||||
ParquetReader.Builder parquetReaderBuilder = ParquetReader.builder(readSupport, parquetFilePath);
|
||||
parquetReaderBuilder.withConf(hadoopConf);
|
||||
reader = parquetReaderBuilder.build();
|
||||
Group g = null;
|
||||
|
||||
// 从文件名中解析分区信息
|
||||
List<ColumnEntry> hivePartitionColumnEntrys = UnstructuredStorageReaderUtil.getListColumnEntry(readerSliceConfig, com.alibaba.datax.plugin.unstructuredstorage.reader.Key.HIVE_PARTION_COLUMN);
|
||||
ArrayList<Column> hivePartitionColumns = new ArrayList<>();
|
||||
hivePartitionColumns = UnstructuredStorageReaderUtil.getHivePartitionColumns(sourceParquetFilePath, hivePartitionColumnEntrys);
|
||||
List<String> schemaFieldList = null;
|
||||
Map<Integer, String> colNameIndexMap = null;
|
||||
Map<Integer, Integer> indexMap = null;
|
||||
if (supportAddMiddleColumn) {
|
||||
boolean nonName = column.stream().anyMatch(columnEntry -> StringUtils.isEmpty(columnEntry.getName()));
|
||||
if (nonName) {
|
||||
throw new DataXException("You configured column item without name, please correct it");
|
||||
}
|
||||
List<org.apache.parquet.schema.Type> parquetFileFields = getParquetFileFields(parquetFilePath, hadoopConf);
|
||||
schemaFieldList = parquetFileFields.stream().map(org.apache.parquet.schema.Type::getName).collect(Collectors.toList());
|
||||
colNameIndexMap = new ConcurrentHashMap<>();
|
||||
Map<Integer, String> finalColNameIndexMap = colNameIndexMap;
|
||||
column.forEach(columnEntry -> finalColNameIndexMap.put(columnEntry.getIndex(), columnEntry.getName()));
|
||||
Iterator<Map.Entry<Integer, String>> iterator = finalColNameIndexMap.entrySet().iterator();
|
||||
while (iterator.hasNext()) {
|
||||
Map.Entry<Integer, String> next = iterator.next();
|
||||
if (!schemaFieldList.contains(next.getValue())) {
|
||||
finalColNameIndexMap.remove((next.getKey()));
|
||||
}
|
||||
}
|
||||
LOG.info("SupportAddMiddleColumn is true, fields from parquet file is {}, " +
|
||||
"colNameIndexMap is {}", JSON.toJSONString(schemaFieldList), JSON.toJSONString(colNameIndexMap));
|
||||
fieldCount = column.size();
|
||||
indexMap = new HashMap<>();
|
||||
for (int j = 0; j < fieldCount; j++) {
|
||||
if (colNameIndexMap.containsKey(j)) {
|
||||
int index = findIndex(schemaFieldList, findEleInMap(colNameIndexMap, j));
|
||||
indexMap.put(j, index);
|
||||
}
|
||||
}
|
||||
}
|
||||
while ((g = reader.read()) != null) {
|
||||
List<Object> formattedRecord = new ArrayList<Object>(fieldCount);
|
||||
try {
|
||||
for (int j = 0; j < fieldCount; j++) {
|
||||
Object data = null;
|
||||
try {
|
||||
if (null != ignoreIndex && !ignoreIndex.isEmpty() && ignoreIndex.contains(j)) {
|
||||
data = null;
|
||||
} else {
|
||||
if (supportAddMiddleColumn) {
|
||||
if (!colNameIndexMap.containsKey(j)) {
|
||||
formattedRecord.add(null);
|
||||
continue;
|
||||
} else {
|
||||
data = DFSUtil.this.readFields(g, parquetTypes.get(indexMap.get(j)), indexMap.get(j), parquetMetaMap, isUtcTimestamp);
|
||||
}
|
||||
} else {
|
||||
data = DFSUtil.this.readFields(g, parquetTypes.get(j), j, parquetMetaMap, isUtcTimestamp);
|
||||
}
|
||||
}
|
||||
} catch (RuntimeException e) {
|
||||
if (printNullValueException) {
|
||||
LOG.warn(e.getMessage());
|
||||
}
|
||||
}
|
||||
formattedRecord.add(data);
|
||||
}
|
||||
transportOneRecord(column, formattedRecord, recordSender, taskPluginCollector, isReadAllColumns, nullFormat, hivePartitionColumns);
|
||||
} catch (Exception e) {
|
||||
throw DataXException.asDataXException(HdfsReaderErrorCode.READ_PARQUET_ERROR, e);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw DataXException.asDataXException(HdfsReaderErrorCode.READ_PARQUET_ERROR, e);
|
||||
} finally {
|
||||
org.apache.commons.io.IOUtils.closeQuietly(reader);
|
||||
}
|
||||
}
|
||||
|
||||
private String findEleInMap(Map<Integer, String> map, Integer key) {
|
||||
Iterator<Map.Entry<Integer, String>> iterator = map.entrySet().iterator();
|
||||
while (iterator.hasNext()) {
|
||||
Map.Entry<Integer, String> next = iterator.next();
|
||||
if (key.equals(next.getKey())) {
|
||||
return next.getValue();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private int findIndex(List<String> schemaFieldList, String colName) {
|
||||
for (int i = 0; i < schemaFieldList.size(); i++) {
|
||||
if (schemaFieldList.get(i).equals(colName)) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private List<org.apache.parquet.schema.Type> getParquetFileFields(Path filePath, org.apache.hadoop.conf.Configuration configuration) {
|
||||
try (org.apache.parquet.hadoop.ParquetFileReader reader = org.apache.parquet.hadoop.ParquetFileReader.open(HadoopInputFile.fromPath(filePath, configuration))) {
|
||||
org.apache.parquet.schema.MessageType schema = reader.getFooter().getFileMetaData().getSchema();
|
||||
List<org.apache.parquet.schema.Type> fields = schema.getFields();
|
||||
return fields;
|
||||
} catch (IOException e) {
|
||||
LOG.error("Fetch parquet field error", e);
|
||||
throw new DataXException(String.format("Fetch parquet field error, msg is %s", e.getMessage()));
|
||||
}
|
||||
}
|
||||
|
||||
private String getParquetSchema(String sourceParquetFilePath, org.apache.hadoop.conf.Configuration hadoopConf) {
|
||||
GroupReadSupport readSupport = new GroupReadSupport();
|
||||
ParquetReader.Builder parquetReaderBuilder = ParquetReader.builder(readSupport, new Path(sourceParquetFilePath));
|
||||
ParquetReader<Group> reader = null;
|
||||
try {
|
||||
parquetReaderBuilder.withConf(hadoopConf);
|
||||
reader = parquetReaderBuilder.build();
|
||||
Group g = null;
|
||||
if ((g = reader.read()) != null) {
|
||||
return g.getType().toString();
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
LOG.error("Inner error, getParquetSchema failed, message is {}", e.getMessage());
|
||||
} finally {
|
||||
org.apache.commons.io.IOUtils.closeQuietly(reader);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* parquet 相关
|
||||
*/
|
||||
private static final int JULIAN_EPOCH_OFFSET_DAYS = 2440588;
|
||||
private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1);
|
||||
private static final long NANOS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1);
|
||||
|
||||
private long julianDayToMillis(int julianDay) {
|
||||
return (julianDay - JULIAN_EPOCH_OFFSET_DAYS) * MILLIS_IN_DAY;
|
||||
}
|
||||
|
||||
private org.apache.parquet.schema.OriginalType getOriginalType(org.apache.parquet.schema.Type type, Map<String, ParquetMeta> parquetMetaMap) {
|
||||
ParquetMeta meta = parquetMetaMap.get(type.getName());
|
||||
return meta.getOriginalType();
|
||||
}
|
||||
|
||||
private org.apache.parquet.schema.PrimitiveType asPrimitiveType(org.apache.parquet.schema.Type type, Map<String, ParquetMeta> parquetMetaMap) {
|
||||
ParquetMeta meta = parquetMetaMap.get(type.getName());
|
||||
return meta.getPrimitiveType();
|
||||
}
|
||||
|
||||
private Object readFields(Group g, org.apache.parquet.schema.Type type, int index, Map<String, ParquetMeta> parquetMetaMap, boolean isUtcTimestamp) {
|
||||
if (this.getOriginalType(type, parquetMetaMap) == org.apache.parquet.schema.OriginalType.MAP) {
|
||||
Group groupData = g.getGroup(index, 0);
|
||||
List<org.apache.parquet.schema.Type> parquetTypes = groupData.getType().getFields();
|
||||
JSONObject data = new JSONObject();
|
||||
for (int i = 0; i < parquetTypes.size(); i++) {
|
||||
int j = groupData.getFieldRepetitionCount(i);
|
||||
// map key value 的对数
|
||||
for (int k = 0; k < j; k++) {
|
||||
Group groupDataK = groupData.getGroup(0, k);
|
||||
List<org.apache.parquet.schema.Type> parquetTypesK = groupDataK.getType().getFields();
|
||||
if (2 != parquetTypesK.size()) {
|
||||
// warn: 不是key value成对出现
|
||||
throw new RuntimeException(String.format("bad parquet map type: %s", groupData.getValueToString(index, 0)));
|
||||
}
|
||||
Object subDataKey = this.readFields(groupDataK, parquetTypesK.get(0), 0, parquetMetaMap, isUtcTimestamp);
|
||||
Object subDataValue = this.readFields(groupDataK, parquetTypesK.get(1), 1, parquetMetaMap, isUtcTimestamp);
|
||||
if (StringUtils.equalsIgnoreCase("key", parquetTypesK.get(0).getName())) {
|
||||
((JSONObject) data).put(subDataKey.toString(), subDataValue);
|
||||
} else {
|
||||
((JSONObject) data).put(subDataValue.toString(), subDataKey);
|
||||
}
|
||||
}
|
||||
}
|
||||
return data;
|
||||
} else if (this.getOriginalType(type, parquetMetaMap) == org.apache.parquet.schema.OriginalType.MAP_KEY_VALUE) {
|
||||
Group groupData = g.getGroup(index, 0);
|
||||
List<org.apache.parquet.schema.Type> parquetTypes = groupData.getType().getFields();
|
||||
JSONObject data = new JSONObject();
|
||||
for (int i = 0; i < parquetTypes.size(); i++) {
|
||||
int j = groupData.getFieldRepetitionCount(i);
|
||||
// map key value 的对数
|
||||
for (int k = 0; k < j; k++) {
|
||||
Group groupDataK = groupData.getGroup(0, k);
|
||||
List<org.apache.parquet.schema.Type> parquetTypesK = groupDataK.getType().getFields();
|
||||
if (2 != parquetTypesK.size()) {
|
||||
// warn: 不是key value成对出现
|
||||
throw new RuntimeException(String.format("bad parquet map type: %s", groupData.getValueToString(index, 0)));
|
||||
}
|
||||
Object subDataKey = this.readFields(groupDataK, parquetTypesK.get(0), 0, parquetMetaMap, isUtcTimestamp);
|
||||
Object subDataValue = this.readFields(groupDataK, parquetTypesK.get(1), 1, parquetMetaMap, isUtcTimestamp);
|
||||
if (StringUtils.equalsIgnoreCase("key", parquetTypesK.get(0).getName())) {
|
||||
((JSONObject) data).put(subDataKey.toString(), subDataValue);
|
||||
} else {
|
||||
((JSONObject) data).put(subDataValue.toString(), subDataKey);
|
||||
}
|
||||
}
|
||||
}
|
||||
return data;
|
||||
} else if (this.getOriginalType(type, parquetMetaMap) == org.apache.parquet.schema.OriginalType.LIST) {
|
||||
Group groupData = g.getGroup(index, 0);
|
||||
List<org.apache.parquet.schema.Type> parquetTypes = groupData.getType().getFields();
|
||||
JSONArray data = new JSONArray();
|
||||
for (int i = 0; i < parquetTypes.size(); i++) {
|
||||
Object subData = this.readFields(groupData, parquetTypes.get(i), i, parquetMetaMap, isUtcTimestamp);
|
||||
data.add(subData);
|
||||
}
|
||||
return data;
|
||||
} else if (this.getOriginalType(type, parquetMetaMap) == org.apache.parquet.schema.OriginalType.DECIMAL) {
|
||||
Binary binaryDate = g.getBinary(index, 0);
|
||||
if (null == binaryDate) {
|
||||
return null;
|
||||
} else {
|
||||
org.apache.hadoop.hive.serde2.io.HiveDecimalWritable decimalWritable = new org.apache.hadoop.hive.serde2.io.HiveDecimalWritable(binaryDate.getBytes(), this.asPrimitiveType(type, parquetMetaMap).getDecimalMetadata().getScale());
|
||||
// g.getType().getFields().get(1).asPrimitiveType().getDecimalMetadata().getScale()
|
||||
HiveDecimal hiveDecimal = decimalWritable.getHiveDecimal();
|
||||
if (null == hiveDecimal) {
|
||||
return null;
|
||||
} else {
|
||||
return hiveDecimal.bigDecimalValue();
|
||||
}
|
||||
// return decimalWritable.doubleValue();
|
||||
}
|
||||
} else if (this.getOriginalType(type, parquetMetaMap) == org.apache.parquet.schema.OriginalType.DATE) {
|
||||
return java.sql.Date.valueOf(LocalDate.ofEpochDay(g.getInteger(index, 0)));
|
||||
} else if (this.getOriginalType(type, parquetMetaMap) == org.apache.parquet.schema.OriginalType.UTF8) {
|
||||
return g.getValueToString(index, 0);
|
||||
} else {
|
||||
if (type.isPrimitive()) {
|
||||
PrimitiveType.PrimitiveTypeName primitiveTypeName = this.asPrimitiveType(type, parquetMetaMap).getPrimitiveTypeName();
|
||||
if (PrimitiveType.PrimitiveTypeName.BINARY == primitiveTypeName) {
|
||||
return g.getValueToString(index, 0);
|
||||
} else if (PrimitiveType.PrimitiveTypeName.BOOLEAN == primitiveTypeName) {
|
||||
return g.getValueToString(index, 0);
|
||||
} else if (PrimitiveType.PrimitiveTypeName.DOUBLE == primitiveTypeName) {
|
||||
return g.getValueToString(index, 0);
|
||||
} else if (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY == primitiveTypeName) {
|
||||
return g.getValueToString(index, 0);
|
||||
} else if (PrimitiveType.PrimitiveTypeName.FLOAT == primitiveTypeName) {
|
||||
return g.getValueToString(index, 0);
|
||||
} else if (PrimitiveType.PrimitiveTypeName.INT32 == primitiveTypeName) {
|
||||
return g.getValueToString(index, 0);
|
||||
} else if (PrimitiveType.PrimitiveTypeName.INT64 == primitiveTypeName) {
|
||||
return g.getValueToString(index, 0);
|
||||
} else if (PrimitiveType.PrimitiveTypeName.INT96 == primitiveTypeName) {
|
||||
Binary dataInt96 = g.getInt96(index, 0);
|
||||
if (null == dataInt96) {
|
||||
return null;
|
||||
} else {
|
||||
ByteBuffer buf = dataInt96.toByteBuffer();
|
||||
buf.order(ByteOrder.LITTLE_ENDIAN);
|
||||
long timeOfDayNanos = buf.getLong();
|
||||
int julianDay = buf.getInt();
|
||||
if (isUtcTimestamp) {
|
||||
// UTC
|
||||
LocalDate localDate = LocalDate.ofEpochDay(julianDay - JULIAN_EPOCH_OFFSET_DAYS);
|
||||
LocalTime localTime = LocalTime.ofNanoOfDay(timeOfDayNanos);
|
||||
return Timestamp.valueOf(LocalDateTime.of(localDate, localTime));
|
||||
} else {
|
||||
// local time
|
||||
long mills = julianDayToMillis(julianDay) + (timeOfDayNanos / NANOS_PER_MILLISECOND);
|
||||
Timestamp timestamp = new Timestamp(mills);
|
||||
timestamp.setNanos((int) (timeOfDayNanos % TimeUnit.SECONDS.toNanos(1)));
|
||||
return timestamp;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return g.getValueToString(index, 0);
|
||||
}
|
||||
} else {
|
||||
return g.getValueToString(index, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,21 @@
|
||||
package com.alibaba.datax.plugin.reader.hdfsreader;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.PathFilter;
|
||||
|
||||
/**
|
||||
* Created by wmy on 16/11/29.
|
||||
*/
|
||||
public class HdfsPathFilter implements PathFilter {
|
||||
|
||||
private String regex = null;
|
||||
|
||||
public HdfsPathFilter(String regex) {
|
||||
this.regex = regex;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accept(Path path) {
|
||||
return regex != null ? path.getName().matches(regex) : true;
|
||||
}
|
||||
}
|
@ -41,6 +41,8 @@ public class HdfsReader extends Reader {
|
||||
private String specifiedFileType = null;
|
||||
private DFSUtil dfsUtil = null;
|
||||
private List<String> path = null;
|
||||
private boolean skipEmptyOrcFile = false;
|
||||
private Integer orcFileEmptySize = null;
|
||||
|
||||
@Override
|
||||
public void init() {
|
||||
@ -81,9 +83,10 @@ public class HdfsReader extends Reader {
|
||||
!specifiedFileType.equalsIgnoreCase(Constant.TEXT) &&
|
||||
!specifiedFileType.equalsIgnoreCase(Constant.CSV) &&
|
||||
!specifiedFileType.equalsIgnoreCase(Constant.SEQ) &&
|
||||
!specifiedFileType.equalsIgnoreCase(Constant.RC)){
|
||||
String message = "HdfsReader插件目前支持ORC, TEXT, CSV, SEQUENCE, RC五种格式的文件," +
|
||||
"请将fileType选项的值配置为ORC, TEXT, CSV, SEQUENCE 或者 RC";
|
||||
!specifiedFileType.equalsIgnoreCase(Constant.RC) &&
|
||||
!specifiedFileType.equalsIgnoreCase(Constant.PARQUET)){
|
||||
String message = "HdfsReader插件目前支持ORC, TEXT, CSV, SEQUENCE, RC, PARQUET 六种格式的文件," +
|
||||
"请将fileType选项的值配置为ORC, TEXT, CSV, SEQUENCE,RC 和 PARQUET";
|
||||
throw DataXException.asDataXException(HdfsReaderErrorCode.FILE_TYPE_ERROR, message);
|
||||
}
|
||||
|
||||
@ -115,6 +118,16 @@ public class HdfsReader extends Reader {
|
||||
UnstructuredStorageReaderUtil.validateCompress(this.readerOriginConfig);
|
||||
UnstructuredStorageReaderUtil.validateCsvReaderConfig(this.readerOriginConfig);
|
||||
}
|
||||
if (this.specifiedFileType.equalsIgnoreCase(Constant.ORC)) {
|
||||
skipEmptyOrcFile = this.readerOriginConfig.getBool(Key.SKIP_EMPTY_ORCFILE, false);
|
||||
orcFileEmptySize = this.readerOriginConfig.getInt(Key.ORCFILE_EMPTYSIZE);
|
||||
//将orcFileEmptySize必填项检查去掉,仅需要配置skipEmptyOrcFile即可,考虑历史任务兼容性(For中华保险),保留orcFileEmptySize参数配置
|
||||
//if (skipEmptyOrcFile && orcFileEmptySize == null) {
|
||||
// throw new IllegalArgumentException("When \"skipEmptyOrcFile\" is configured, "
|
||||
// + "parameter \"orcFileEmptySize\" cannot be null.");
|
||||
//}
|
||||
}
|
||||
LOG.info("skipEmptyOrcFile: {}, orcFileEmptySize: {}", skipEmptyOrcFile, orcFileEmptySize);
|
||||
|
||||
}
|
||||
|
||||
@ -166,7 +179,7 @@ public class HdfsReader extends Reader {
|
||||
@Override
|
||||
public void prepare() {
|
||||
LOG.info("prepare(), start to getAllFiles...");
|
||||
this.sourceFiles = dfsUtil.getAllFiles(path, specifiedFileType);
|
||||
this.sourceFiles = dfsUtil.getAllFiles(path, specifiedFileType,skipEmptyOrcFile, orcFileEmptySize);
|
||||
LOG.info(String.format("您即将读取的文件数为: [%s], 列表为: [%s]",
|
||||
this.sourceFiles.size(),
|
||||
StringUtils.join(this.sourceFiles, ",")));
|
||||
@ -273,7 +286,9 @@ public class HdfsReader extends Reader {
|
||||
}else if(specifiedFileType.equalsIgnoreCase(Constant.RC)){
|
||||
|
||||
dfsUtil.rcFileStartRead(sourceFile, this.taskConfig, recordSender, this.getTaskPluginCollector());
|
||||
}else {
|
||||
} else if (specifiedFileType.equalsIgnoreCase(Constant.PARQUET)) {
|
||||
dfsUtil.parquetFileStartRead(sourceFile, this.taskConfig, recordSender, this.getTaskPluginCollector());
|
||||
} else {
|
||||
|
||||
String message = "HdfsReader插件目前支持ORC, TEXT, CSV, SEQUENCE, RC五种格式的文件," +
|
||||
"请将fileType选项的值配置为ORC, TEXT, CSV, SEQUENCE 或者 RC";
|
||||
|
@ -19,7 +19,12 @@ public enum HdfsReaderErrorCode implements ErrorCode {
|
||||
FILE_TYPE_UNSUPPORT("HdfsReader-12", "文件类型目前不支持"),
|
||||
KERBEROS_LOGIN_ERROR("HdfsReader-13", "KERBEROS认证失败"),
|
||||
READ_SEQUENCEFILE_ERROR("HdfsReader-14", "读取SequenceFile文件出错"),
|
||||
READ_RCFILE_ERROR("HdfsReader-15", "读取RCFile文件出错"),;
|
||||
READ_RCFILE_ERROR("HdfsReader-15", "读取RCFile文件出错"),
|
||||
INIT_RCFILE_SERDE_ERROR("HdfsReader-16", "Deserialize RCFile, initialization failed!"),
|
||||
PARSE_MESSAGE_TYPE_FROM_SCHEMA_ERROR("HdfsReader-17", "Error parsing ParquetSchema"),
|
||||
INVALID_PARQUET_SCHEMA("HdfsReader-18", "ParquetSchema is invalid"),
|
||||
READ_PARQUET_ERROR("HdfsReader-19", "Error reading Parquet file"),
|
||||
CONNECT_HDFS_IO_ERROR("HdfsReader-20", "I/O exception in establishing connection with HDFS");
|
||||
|
||||
private final String code;
|
||||
private final String description;
|
||||
|
@ -7,9 +7,60 @@ public final class Key {
|
||||
*/
|
||||
public final static String PATH = "path";
|
||||
public final static String DEFAULT_FS = "defaultFS";
|
||||
public final static String HIVE_VERSION = "hiveVersion";
|
||||
public static final String FILETYPE = "fileType";
|
||||
public static final String HADOOP_CONFIG = "hadoopConfig";
|
||||
public static final String HAVE_KERBEROS = "haveKerberos";
|
||||
public static final String KERBEROS_KEYTAB_FILE_PATH = "kerberosKeytabFilePath";
|
||||
public static final String KERBEROS_CONF_FILE_PATH = "kerberosConfFilePath";
|
||||
public static final String KERBEROS_PRINCIPAL = "kerberosPrincipal";
|
||||
public static final String PATH_FILTER = "pathFilter";
|
||||
public static final String PARQUET_SCHEMA = "parquetSchema";
|
||||
/**
|
||||
* hive 3.x 或 cdh高版本,使用UTC时区存储时间戳,如果发现时区偏移,该配置项要配置成 true
|
||||
*/
|
||||
public static final String PARQUET_UTC_TIMESTAMP = "parquetUtcTimestamp";
|
||||
public static final String SUCCESS_ON_NO_FILE = "successOnNoFile";
|
||||
public static final String PROTECTION = "protection";
|
||||
|
||||
/**
|
||||
* 用于显示地指定hdfs客户端的用户名
|
||||
*/
|
||||
public static final String HDFS_USERNAME = "hdfsUsername";
|
||||
|
||||
/**
|
||||
* ORC FILE空文件大小
|
||||
*/
|
||||
public static final String ORCFILE_EMPTYSIZE = "orcFileEmptySize";
|
||||
|
||||
/**
|
||||
* 是否跳过空的OrcFile
|
||||
*/
|
||||
public static final String SKIP_EMPTY_ORCFILE = "skipEmptyOrcFile";
|
||||
|
||||
/**
|
||||
* 是否跳过 orc meta 信息
|
||||
*/
|
||||
|
||||
public static final String SKIP_ORC_META = "skipOrcMetaInfo";
|
||||
/**
|
||||
* 过滤_或者.开头的文件
|
||||
*/
|
||||
public static final String REGEX_PATTERN = "^.*[/][^._].*";
|
||||
|
||||
public static final String FILTER_TAG_FILE = "filterTagFile";
|
||||
|
||||
// high level params refs https://github.com/aliyun/alibabacloud-jindodata/blob/master/docs/user/4.x/4.4.0/oss/configuration/jindosdk_configuration_list.md
|
||||
// <!-- oss 并发下载任务队列大小 -->
|
||||
public static final String FS_OSS_DOWNLOAD_QUEUE_SIZE = "ossDownloadQueueSize";
|
||||
|
||||
// <!-- 进程内 oss 最大并发下载任务数 -->
|
||||
public static final String FS_OSS_DOWNLOAD_THREAD_CONCURRENCY = "ossDownloadThreadConcurrency";
|
||||
|
||||
public static final String FS_OSS_READ_READAHEAD_BUFFER_COUNT = "ossDownloadBufferCount";
|
||||
|
||||
public static final String FILE_SYSTEM_TYPE = "fileSystemType";
|
||||
public static final String CDH_3_X_HIVE_VERSION = "3.1.3-cdh";
|
||||
|
||||
public static final String SUPPORT_ADD_MIDDLE_COLUMN = "supportAddMiddleColumn";
|
||||
}
|
||||
|
@ -0,0 +1,33 @@
|
||||
package com.alibaba.datax.plugin.reader.hdfsreader;
|
||||
|
||||
import org.apache.parquet.schema.OriginalType;
|
||||
import org.apache.parquet.schema.PrimitiveType;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @author jitongchen
|
||||
* @date 2023/9/7 10:20 AM
|
||||
*/
|
||||
public class ParquetMessageHelper {
|
||||
public static Map<String, ParquetMeta> parseParquetTypes(List<org.apache.parquet.schema.Type> parquetTypes) {
|
||||
int fieldCount = parquetTypes.size();
|
||||
Map<String, ParquetMeta> parquetMetaMap = new HashMap<String, ParquetMeta>();
|
||||
for (int i = 0; i < fieldCount; i++) {
|
||||
org.apache.parquet.schema.Type type = parquetTypes.get(i);
|
||||
String name = type.getName();
|
||||
ParquetMeta parquetMeta = new ParquetMeta();
|
||||
parquetMeta.setName(name);
|
||||
OriginalType originalType = type.getOriginalType();
|
||||
parquetMeta.setOriginalType(originalType);
|
||||
if (type.isPrimitive()) {
|
||||
PrimitiveType primitiveType = type.asPrimitiveType();
|
||||
parquetMeta.setPrimitiveType(primitiveType);
|
||||
}
|
||||
parquetMetaMap.put(name, parquetMeta);
|
||||
}
|
||||
return parquetMetaMap;
|
||||
}
|
||||
}
|
@ -0,0 +1,38 @@
|
||||
package com.alibaba.datax.plugin.reader.hdfsreader;
|
||||
|
||||
import org.apache.parquet.schema.OriginalType;
|
||||
import org.apache.parquet.schema.PrimitiveType;
|
||||
|
||||
/**
|
||||
* @author jitongchen
|
||||
* @date 2023/9/7 10:20 AM
|
||||
*/
|
||||
public class ParquetMeta {
|
||||
private String name;
|
||||
private OriginalType originalType;
|
||||
private PrimitiveType primitiveType;
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public OriginalType getOriginalType() {
|
||||
return originalType;
|
||||
}
|
||||
|
||||
public void setOriginalType(OriginalType originalType) {
|
||||
this.originalType = originalType;
|
||||
}
|
||||
|
||||
public PrimitiveType getPrimitiveType() {
|
||||
return primitiveType;
|
||||
}
|
||||
|
||||
public void setPrimitiveType(PrimitiveType primitiveType) {
|
||||
this.primitiveType = primitiveType;
|
||||
}
|
||||
}
|
@ -27,9 +27,8 @@ import org.apache.hadoop.mapred.*;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import parquet.schema.OriginalType;
|
||||
import parquet.schema.PrimitiveType;
|
||||
import parquet.schema.Types;
|
||||
import parquet.hadoop.metadata.CompressionCodecName;
|
||||
import parquet.schema.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.SimpleDateFormat;
|
||||
@ -626,4 +625,61 @@ public class HdfsHelper {
|
||||
}
|
||||
return typeBuilder.named("m").toString();
|
||||
}
|
||||
|
||||
public void parquetFileStartWrite(RecordReceiver lineReceiver, Configuration config, String fileName, TaskPluginCollector taskPluginCollector, Configuration taskConfig) {
|
||||
MessageType messageType = null;
|
||||
ParquetFileProccessor proccessor = null;
|
||||
Path outputPath = new Path(fileName);
|
||||
String schema = config.getString(Key.PARQUET_SCHEMA);
|
||||
try {
|
||||
messageType = MessageTypeParser.parseMessageType(schema);
|
||||
} catch (Exception e) {
|
||||
String message = String.format("Error parsing the Schema string [%s] into MessageType", schema);
|
||||
LOG.error(message);
|
||||
throw DataXException.asDataXException(HdfsWriterErrorCode.PARSE_MESSAGE_TYPE_FROM_SCHEMA_ERROR, e);
|
||||
}
|
||||
|
||||
// determine the compression codec
|
||||
String compress = config.getString(Key.COMPRESS, null);
|
||||
// be compatible with the old NONE
|
||||
if ("NONE".equalsIgnoreCase(compress)) {
|
||||
compress = "UNCOMPRESSED";
|
||||
}
|
||||
CompressionCodecName compressionCodecName = CompressionCodecName.fromConf(compress);
|
||||
LOG.info("The compression codec used for parquet writing is: {}", compressionCodecName, compress);
|
||||
try {
|
||||
proccessor = new ParquetFileProccessor(outputPath, messageType, compressionCodecName, false, taskConfig, taskPluginCollector, hadoopConf);
|
||||
} catch (Exception e) {
|
||||
String message = String.format("Initializing ParquetFileProccessor based on Schema[%s] failed.", schema);
|
||||
LOG.error(message);
|
||||
throw DataXException.asDataXException(HdfsWriterErrorCode.INIT_PROCCESSOR_FAILURE, e);
|
||||
}
|
||||
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmm");
|
||||
String attempt = "attempt_" + dateFormat.format(new Date()) + "_0001_m_000000_0";
|
||||
conf.set(JobContext.TASK_ATTEMPT_ID, attempt);
|
||||
FileOutputFormat outFormat = new TextOutputFormat();
|
||||
outFormat.setOutputPath(conf, outputPath);
|
||||
outFormat.setWorkOutputPath(conf, outputPath);
|
||||
try {
|
||||
Record record = null;
|
||||
while ((record = lineReceiver.getFromReader()) != null) {
|
||||
proccessor.write(record);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
String message = String.format("An exception occurred while writing the file file [%s]", fileName);
|
||||
LOG.error(message);
|
||||
Path path = new Path(fileName);
|
||||
deleteDir(path.getParent());
|
||||
throw DataXException.asDataXException(HdfsWriterErrorCode.Write_FILE_IO_ERROR, e);
|
||||
} finally {
|
||||
if (proccessor != null) {
|
||||
try {
|
||||
proccessor.close();
|
||||
} catch (IOException e) {
|
||||
LOG.error(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -53,8 +53,8 @@ public class HdfsWriter extends Writer {
|
||||
this.defaultFS = this.writerSliceConfig.getNecessaryValue(Key.DEFAULT_FS, HdfsWriterErrorCode.REQUIRED_VALUE);
|
||||
//fileType check
|
||||
this.fileType = this.writerSliceConfig.getNecessaryValue(Key.FILE_TYPE, HdfsWriterErrorCode.REQUIRED_VALUE);
|
||||
if( !fileType.equalsIgnoreCase("ORC") && !fileType.equalsIgnoreCase("TEXT")){
|
||||
String message = "HdfsWriter插件目前只支持ORC和TEXT两种格式的文件,请将filetype选项的值配置为ORC或者TEXT";
|
||||
if (!fileType.equalsIgnoreCase("ORC") && !fileType.equalsIgnoreCase("TEXT") && !fileType.equalsIgnoreCase("PARQUET")) {
|
||||
String message = "HdfsWriter插件目前只支持ORC、TEXT、PARQUET三种格式的文件,请将filetype选项的值配置为ORC、TEXT或PARQUET";
|
||||
throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE, message);
|
||||
}
|
||||
//path
|
||||
@ -415,6 +415,9 @@ public class HdfsWriter extends Writer {
|
||||
//写ORC FILE
|
||||
hdfsHelper.orcFileStartWrite(lineReceiver,this.writerSliceConfig, this.fileName,
|
||||
this.getTaskPluginCollector());
|
||||
} else if (fileType.equalsIgnoreCase("PARQUET")) {
|
||||
//写PARQUET FILE
|
||||
hdfsHelper.parquetFileStartWrite(lineReceiver, this.writerSliceConfig, this.fileName, this.getTaskPluginCollector(), this.writerSliceConfig);
|
||||
}
|
||||
|
||||
LOG.info("end do write");
|
||||
|
@ -16,7 +16,11 @@ public enum HdfsWriterErrorCode implements ErrorCode {
|
||||
CONNECT_HDFS_IO_ERROR("HdfsWriter-06", "与HDFS建立连接时出现IO异常."),
|
||||
COLUMN_REQUIRED_VALUE("HdfsWriter-07", "您column配置中缺失了必须填写的参数值."),
|
||||
HDFS_RENAME_FILE_ERROR("HdfsWriter-08", "将文件移动到配置路径失败."),
|
||||
KERBEROS_LOGIN_ERROR("HdfsWriter-09", "KERBEROS认证失败");
|
||||
KERBEROS_LOGIN_ERROR("HdfsWriter-09", "KERBEROS认证失败"),
|
||||
PARSE_MESSAGE_TYPE_FROM_SCHEMA_ERROR("HdfsWriter-10", "Parse parquet schema error"),
|
||||
|
||||
INIT_PROCCESSOR_FAILURE("HdfsWriter-11", "Init processor failed");
|
||||
|
||||
|
||||
private final String code;
|
||||
private final String description;
|
||||
|
@ -46,4 +46,32 @@ public class Key {
|
||||
|
||||
public static final String PARQUET_SCHEMA = "parquetSchema";
|
||||
public static final String PARQUET_MERGE_RESULT = "parquetMergeResult";
|
||||
|
||||
/**
|
||||
* hive 3.x 或 cdh高版本,使用UTC时区存储时间戳,如果发现时区偏移,该配置项要配置成 true
|
||||
*/
|
||||
public static final String PARQUET_UTC_TIMESTAMP = "parquetUtcTimestamp";
|
||||
|
||||
// Kerberos
|
||||
public static final String KERBEROS_CONF_FILE_PATH = "kerberosConfFilePath";
|
||||
|
||||
// PanguFS
|
||||
public final static String PANGU_FS_CONFIG = "panguFSConfig";
|
||||
public final static String PANGU_FS_CONFIG_NUWA_CLUSTER = "nuwaCluster";
|
||||
public final static String PANGU_FS_CONFIG_NUWA_SERVERS = "nuwaServers";
|
||||
public final static String PANGU_FS_CONFIG_NUWA_PROXIES = "nuwaProxies";
|
||||
public final static String PANGU_FS_CONFIG_CAPABILITY = "capability";
|
||||
|
||||
|
||||
public static final String FS_OSS_UPLOAD_THREAD_CONCURRENCY = "ossUploadConcurrency";
|
||||
// <!-- oss 并发上传任务队列大小 -->
|
||||
public static final String FS_OSS_UPLOAD_QUEUE_SIZE = "ossUploadQueueSize";
|
||||
// <!-- 进程内 oss 最大并发上传任务数 -->
|
||||
public static final String FS_OSS_UPLOAD_MAX_PENDING_TASKS_PER_STREAM = "ossUploadMaxPendingTasksPerStream";
|
||||
|
||||
public static final String FS_OSS_BLOCKLET_SIZE_MB = "ossBlockSize";
|
||||
|
||||
public static final String FILE_SYSTEM_TYPE = "fileSystemType";
|
||||
public static final String ENABLE_COLUMN_EXCHANGE = "enableColumnExchange";
|
||||
public static final String SUPPORT_HIVE_DATETIME = "supportHiveDateTime";
|
||||
}
|
||||
|
@ -0,0 +1,30 @@
|
||||
package com.alibaba.datax.plugin.writer.hdfswriter;
|
||||
|
||||
import com.alibaba.datax.common.element.Record;
|
||||
import com.alibaba.datax.common.plugin.TaskPluginCollector;
|
||||
import com.alibaba.datax.common.util.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import parquet.hadoop.ParquetWriter;
|
||||
import parquet.hadoop.metadata.CompressionCodecName;
|
||||
import parquet.schema.MessageType;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* @author jitongchen
|
||||
* @date 2023/9/7 9:41 AM
|
||||
*/
|
||||
public class ParquetFileProccessor extends ParquetWriter<Record> {
|
||||
|
||||
public ParquetFileProccessor(Path file, MessageType schema, boolean enableDictionary, Configuration taskConfig, TaskPluginCollector taskPluginCollector, org.apache.hadoop.conf.Configuration configuration) throws IOException {
|
||||
this(file, schema, CompressionCodecName.UNCOMPRESSED, enableDictionary, taskConfig, taskPluginCollector, configuration);
|
||||
}
|
||||
|
||||
public ParquetFileProccessor(Path file, MessageType schema, CompressionCodecName codecName, boolean enableDictionary, Configuration taskConfig, TaskPluginCollector taskPluginCollector) throws IOException {
|
||||
super(file, new ParquetFileSupport(schema, taskConfig, taskPluginCollector), codecName, DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE, DEFAULT_PAGE_SIZE, enableDictionary, false, DEFAULT_WRITER_VERSION);
|
||||
}
|
||||
|
||||
public ParquetFileProccessor(Path file, MessageType schema, CompressionCodecName codecName, boolean enableDictionary, Configuration taskConfig, TaskPluginCollector taskPluginCollector, org.apache.hadoop.conf.Configuration configuration) throws IOException {
|
||||
super(file, new ParquetFileSupport(schema, taskConfig, taskPluginCollector), codecName, DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE, DEFAULT_PAGE_SIZE, enableDictionary, false, DEFAULT_WRITER_VERSION, configuration);
|
||||
}
|
||||
}
|
@ -0,0 +1,642 @@
|
||||
package com.alibaba.datax.plugin.writer.hdfswriter;
|
||||
|
||||
import com.alibaba.datax.common.element.*;
|
||||
import com.alibaba.datax.common.plugin.TaskPluginCollector;
|
||||
import com.alibaba.datax.common.util.LimitLogger;
|
||||
import com.alibaba.fastjson2.JSON;
|
||||
import com.alibaba.fastjson2.JSONArray;
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import parquet.column.ColumnDescriptor;
|
||||
import parquet.hadoop.api.WriteSupport;
|
||||
import parquet.io.api.Binary;
|
||||
import parquet.io.api.RecordConsumer;
|
||||
import parquet.schema.*;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
import java.math.RoundingMode;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.sql.Timestamp;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.time.ZoneOffset;
|
||||
import java.time.temporal.ChronoField;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/**
|
||||
* @author jitongchen
|
||||
* @date 2023/9/7 9:41 AM
|
||||
*/
|
||||
public class ParquetFileSupport extends WriteSupport<Record> {
|
||||
public static final Logger LOGGER = LoggerFactory.getLogger(ParquetFileSupport.class);
|
||||
private MessageType schema;
|
||||
private List<ColumnDescriptor> columns;
|
||||
private RecordConsumer recordConsumer;
|
||||
private boolean useRawDataTransf = true;
|
||||
private boolean printStackTrace = true;
|
||||
|
||||
// nullFormat for the different types
|
||||
private String nullFormat;
|
||||
|
||||
private String dateFormat;
|
||||
private boolean isUtcTimestamp;
|
||||
private SimpleDateFormat dateParse;
|
||||
private Binary binaryForNull;
|
||||
private TaskPluginCollector taskPluginCollector;
|
||||
private String dataxParquetMode;
|
||||
|
||||
public ParquetFileSupport(MessageType schema, com.alibaba.datax.common.util.Configuration taskConfig, TaskPluginCollector taskPluginCollector) {
|
||||
this.schema = schema;
|
||||
this.columns = schema.getColumns();
|
||||
this.useRawDataTransf = taskConfig.getBool(Key.PARQUET_FILE_USE_RAW_DATA_TRANSF, true);
|
||||
|
||||
// 不通类型的nullFormat
|
||||
this.nullFormat = taskConfig.getString(Key.NULL_FORMAT, Constant.DEFAULT_NULL_FORMAT);
|
||||
this.binaryForNull = Binary.fromString(this.nullFormat);
|
||||
|
||||
this.dateFormat = taskConfig.getString(Key.DATE_FORMAT, null);
|
||||
if (StringUtils.isNotBlank(this.dateFormat)) {
|
||||
this.dateParse = new SimpleDateFormat(dateFormat);
|
||||
}
|
||||
|
||||
this.isUtcTimestamp = taskConfig.getBool(Key.PARQUET_UTC_TIMESTAMP, false);
|
||||
|
||||
this.taskPluginCollector = taskPluginCollector;
|
||||
if (taskConfig.getKeys().contains("dataxParquetMode")) {
|
||||
this.dataxParquetMode = taskConfig.getString("dataxParquetMode");
|
||||
} else {
|
||||
// 默认值是columns
|
||||
this.dataxParquetMode = "columns";
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public WriteContext init(Configuration configuration) {
|
||||
return new WriteContext(schema, new HashMap<String, String>());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void prepareForWrite(RecordConsumer recordConsumer) {
|
||||
this.recordConsumer = recordConsumer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Record values) {
|
||||
if (dataxParquetMode.equalsIgnoreCase("fields")) {
|
||||
writeBaseOnFields(values);
|
||||
return;
|
||||
}
|
||||
|
||||
// NOTE: 下面的实现其实是不对的,只是看代码注释貌似有用户已经在用
|
||||
// 所以暂时不动下面的逻辑。
|
||||
// 默认走的就是下面的这条代码路径
|
||||
if (values != null && columns != null && values.getColumnNumber() == columns.size()) {
|
||||
recordConsumer.startMessage();
|
||||
for (int i = 0; i < columns.size(); i++) {
|
||||
Column value = values.getColumn(i);
|
||||
ColumnDescriptor columnDescriptor = columns.get(i);
|
||||
Type type = this.schema.getFields().get(i);
|
||||
if (value != null) {
|
||||
try {
|
||||
if (this.useRawDataTransf) {
|
||||
if (value.getRawData() == null) {
|
||||
continue;
|
||||
}
|
||||
recordConsumer.startField(columnDescriptor.getPath()[0], i);
|
||||
// 原来使用Column->RawData的方法其实是错误的类型转换策略,会将DataX的数据内部表示形象序列化出去
|
||||
// 但是 Parquet 已经有用户使用了,故暂时只是配置项切换
|
||||
String rawData = value.getRawData().toString();
|
||||
switch (columnDescriptor.getType()) {
|
||||
case BOOLEAN:
|
||||
recordConsumer.addBoolean(Boolean.parseBoolean(rawData));
|
||||
break;
|
||||
case FLOAT:
|
||||
recordConsumer.addFloat(Float.parseFloat(rawData));
|
||||
break;
|
||||
case DOUBLE:
|
||||
recordConsumer.addDouble(Double.parseDouble(rawData));
|
||||
break;
|
||||
case INT32:
|
||||
OriginalType originalType = type.getOriginalType();
|
||||
if (originalType != null && StringUtils.equalsIgnoreCase("DATE", originalType.name())) {
|
||||
int realVal = (int) (new java.sql.Date(Long.parseLong(rawData)).toLocalDate().toEpochDay());
|
||||
recordConsumer.addInteger(realVal);
|
||||
} else {
|
||||
recordConsumer.addInteger(Integer.parseInt(rawData));
|
||||
}
|
||||
break;
|
||||
case INT64:
|
||||
recordConsumer.addLong(Long.valueOf(rawData));
|
||||
break;
|
||||
case INT96:
|
||||
recordConsumer.addBinary(timestampColToBinary(value));
|
||||
break;
|
||||
case BINARY:
|
||||
recordConsumer.addBinary(Binary.fromString(rawData));
|
||||
break;
|
||||
case FIXED_LEN_BYTE_ARRAY:
|
||||
PrimitiveType primitiveType = type.asPrimitiveType();
|
||||
if (primitiveType.getDecimalMetadata() != null) {
|
||||
// decimal
|
||||
recordConsumer.addBinary(decimalToBinary(value, primitiveType.getDecimalMetadata().getPrecision(), primitiveType.getDecimalMetadata().getScale()));
|
||||
break;
|
||||
}
|
||||
/* fall through */
|
||||
default:
|
||||
recordConsumer.addBinary(Binary.fromString(rawData));
|
||||
break;
|
||||
}
|
||||
|
||||
recordConsumer.endField(columnDescriptor.getPath()[0], i);
|
||||
} else {
|
||||
boolean isNull = null == value.getRawData();
|
||||
|
||||
if (!isNull) {
|
||||
recordConsumer.startField(columnDescriptor.getPath()[0], i);
|
||||
|
||||
// no skip: empty fields are illegal, the field should be ommited completely instead
|
||||
switch (columnDescriptor.getType()) {
|
||||
case BOOLEAN:
|
||||
recordConsumer.addBoolean(value.asBoolean());
|
||||
break;
|
||||
case FLOAT:
|
||||
recordConsumer.addFloat(value.asDouble().floatValue());
|
||||
break;
|
||||
case DOUBLE:
|
||||
recordConsumer.addDouble(value.asDouble());
|
||||
break;
|
||||
case INT32:
|
||||
OriginalType originalType = type.getOriginalType();
|
||||
if (originalType != null && StringUtils.equalsIgnoreCase("DATE", originalType.name())) {
|
||||
int realVal = (int) (new java.sql.Date(value.asLong()).toLocalDate().toEpochDay());
|
||||
recordConsumer.addInteger(realVal);
|
||||
} else {
|
||||
recordConsumer.addInteger(value.asLong().intValue());
|
||||
}
|
||||
break;
|
||||
case INT64:
|
||||
recordConsumer.addLong(value.asLong());
|
||||
break;
|
||||
case INT96:
|
||||
recordConsumer.addBinary(timestampColToBinary(value));
|
||||
break;
|
||||
case BINARY:
|
||||
String valueAsString2Write = null;
|
||||
if (Column.Type.DATE == value.getType() && null != this.dateParse) {
|
||||
valueAsString2Write = dateParse.format(value.asDate());
|
||||
} else {
|
||||
valueAsString2Write = value.asString();
|
||||
}
|
||||
recordConsumer.addBinary(Binary.fromString(valueAsString2Write));
|
||||
break;
|
||||
case FIXED_LEN_BYTE_ARRAY:
|
||||
PrimitiveType primitiveType = type.asPrimitiveType();
|
||||
if (primitiveType.getDecimalMetadata() != null) {
|
||||
// decimal
|
||||
recordConsumer.addBinary(decimalToBinary(value, primitiveType.getDecimalMetadata().getPrecision(), primitiveType.getDecimalMetadata().getScale()));
|
||||
break;
|
||||
}
|
||||
/* fall through */
|
||||
default:
|
||||
recordConsumer.addBinary(Binary.fromString(value.asString()));
|
||||
break;
|
||||
}
|
||||
recordConsumer.endField(columnDescriptor.getPath()[0], i);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
if (printStackTrace) {
|
||||
printStackTrace = false;
|
||||
LOGGER.warn("write to parquet error: {}", e.getMessage(), e);
|
||||
}
|
||||
// dirty data
|
||||
if (null != this.taskPluginCollector) {
|
||||
// job post 里面的merge taskPluginCollector 为null
|
||||
this.taskPluginCollector.collectDirtyRecord(values, e, e.getMessage());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
recordConsumer.addBinary(this.binaryForNull);
|
||||
}
|
||||
}
|
||||
recordConsumer.endMessage();
|
||||
}
|
||||
}
|
||||
|
||||
private Binary decimalToBinary(Column value, int precision, int scale) {
|
||||
BigDecimal bigDecimal = value.asBigDecimal();
|
||||
bigDecimal = bigDecimal.setScale(scale, RoundingMode.HALF_UP);
|
||||
byte[] decimalBytes = bigDecimal.unscaledValue().toByteArray();
|
||||
|
||||
int precToBytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[precision - 1];
|
||||
if (precToBytes == decimalBytes.length) {
|
||||
// No padding needed.
|
||||
return Binary.fromByteArray(decimalBytes);
|
||||
}
|
||||
|
||||
byte[] tgt = new byte[precToBytes];
|
||||
|
||||
// padding -1 for negative number
|
||||
if (bigDecimal.compareTo(new BigDecimal("0")) < 0) {
|
||||
Arrays.fill(tgt, 0, precToBytes - decimalBytes.length, (byte) -1);
|
||||
}
|
||||
|
||||
System.arraycopy(decimalBytes, 0, tgt, precToBytes - decimalBytes.length, decimalBytes.length);
|
||||
return Binary.fromByteArray(tgt);
|
||||
}
|
||||
|
||||
private static final int JULIAN_EPOCH_OFFSET_DAYS = 2_440_588;
|
||||
private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1);
|
||||
private static final long MILLS_PER_SECOND = TimeUnit.SECONDS.toMillis(1);
|
||||
private static final long NANOS_PER_DAY = TimeUnit.DAYS.toNanos(1);
|
||||
private static final long NANOS_PER_SECOND = TimeUnit.SECONDS.toNanos(1);
|
||||
private static final ZoneOffset defaultOffset = OffsetDateTime.now().getOffset();
|
||||
|
||||
/**
|
||||
* int 96 is timestamp in parquet
|
||||
*
|
||||
* @param valueColumn
|
||||
* @return
|
||||
*/
|
||||
private Binary timestampColToBinary(Column valueColumn) {
|
||||
if (valueColumn.getRawData() == null) {
|
||||
return Binary.EMPTY;
|
||||
}
|
||||
long mills;
|
||||
long nanos = 0;
|
||||
if (valueColumn instanceof DateColumn) {
|
||||
DateColumn dateColumn = (DateColumn) valueColumn;
|
||||
mills = dateColumn.asLong();
|
||||
nanos = dateColumn.getNanos();
|
||||
} else {
|
||||
mills = valueColumn.asLong();
|
||||
}
|
||||
int julianDay;
|
||||
long nanosOfDay;
|
||||
if (isUtcTimestamp) {
|
||||
// utc ignore current timezone (task should set timezone same as hive/hdfs)
|
||||
long seconds = mills >= 0 ? mills / MILLS_PER_SECOND : (mills / MILLS_PER_SECOND - 1);
|
||||
LocalDateTime localDateTime = LocalDateTime.ofEpochSecond(seconds, (int) nanos, defaultOffset);
|
||||
julianDay = (int) (localDateTime.getLong(ChronoField.EPOCH_DAY) + JULIAN_EPOCH_OFFSET_DAYS);
|
||||
nanosOfDay = localDateTime.getLong(ChronoField.NANO_OF_DAY);
|
||||
} else {
|
||||
// local date
|
||||
julianDay = (int) ((mills / MILLIS_IN_DAY) + JULIAN_EPOCH_OFFSET_DAYS);
|
||||
if (mills >= 0) {
|
||||
nanosOfDay = ((mills % MILLIS_IN_DAY) / MILLS_PER_SECOND) * NANOS_PER_SECOND + nanos;
|
||||
} else {
|
||||
julianDay--;
|
||||
nanosOfDay = (((mills % MILLIS_IN_DAY) / MILLS_PER_SECOND) - 1) * NANOS_PER_SECOND + nanos;
|
||||
nanosOfDay += NANOS_PER_DAY;
|
||||
}
|
||||
}
|
||||
|
||||
ByteBuffer buf = ByteBuffer.allocate(12);
|
||||
buf.order(ByteOrder.LITTLE_ENDIAN);
|
||||
buf.putLong(nanosOfDay);
|
||||
buf.putInt(julianDay);
|
||||
buf.flip();
|
||||
return Binary.fromByteBuffer(buf);
|
||||
}
|
||||
|
||||
private void writeBaseOnFields(Record values) {
|
||||
//LOGGER.info("Writing parquet data using fields mode(The correct mode.)");
|
||||
List<Type> types = this.schema.getFields();
|
||||
|
||||
if (values != null && types != null && values.getColumnNumber() == types.size()) {
|
||||
recordConsumer.startMessage();
|
||||
writeFields(types, values);
|
||||
recordConsumer.endMessage();
|
||||
}
|
||||
}
|
||||
|
||||
private void writeFields(List<Type> types, Record values) {
|
||||
for (int i = 0; i < types.size(); i++) {
|
||||
Type type = types.get(i);
|
||||
Column value = values.getColumn(i);
|
||||
if (value != null) {
|
||||
try {
|
||||
if (type.isPrimitive()) {
|
||||
writePrimitiveType(type, value, i);
|
||||
} else {
|
||||
writeGroupType(type, (JSON) JSON.parse(value.asString()), i);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
if (printStackTrace) {
|
||||
printStackTrace = false;
|
||||
LOGGER.warn("write to parquet error: {}", e.getMessage(), e);
|
||||
}
|
||||
// dirty data
|
||||
if (null != this.taskPluginCollector) {
|
||||
// job post 里面的merge taskPluginCollector 为null
|
||||
this.taskPluginCollector.collectDirtyRecord(values, e, e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void writeFields(List<Type> types, JSONObject values) {
|
||||
for (int i = 0; i < types.size(); i++) {
|
||||
Type type = types.get(i);
|
||||
Object value = values.get(type.getName());
|
||||
|
||||
if (value != null) {
|
||||
try {
|
||||
if (type.isPrimitive()) {
|
||||
writePrimitiveType(type, value, i);
|
||||
} else {
|
||||
writeGroupType(type, (JSON) value, i);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
if (printStackTrace) {
|
||||
printStackTrace = false;
|
||||
LOGGER.warn("write to parquet error: {}", e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
recordConsumer.addBinary(this.binaryForNull);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void writeGroupType(Type type, JSON value, int index) {
|
||||
GroupType groupType = type.asGroupType();
|
||||
OriginalType originalType = groupType.getOriginalType();
|
||||
if (originalType != null) {
|
||||
switch (originalType) {
|
||||
case MAP:
|
||||
writeMap(groupType, value, index);
|
||||
break;
|
||||
case LIST:
|
||||
writeList(groupType, value, index);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// struct
|
||||
writeStruct(groupType, value, index);
|
||||
}
|
||||
}
|
||||
|
||||
private void writeMap(GroupType groupType, JSON value, int index) {
|
||||
if (value == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
JSONObject json = (JSONObject) value;
|
||||
|
||||
if (json.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
recordConsumer.startField(groupType.getName(), index);
|
||||
|
||||
recordConsumer.startGroup();
|
||||
|
||||
// map
|
||||
// key_value start
|
||||
recordConsumer.startField("key_value", 0);
|
||||
recordConsumer.startGroup();
|
||||
|
||||
List<Type> keyValueFields = groupType.getFields().get(0).asGroupType().getFields();
|
||||
Type keyType = keyValueFields.get(0);
|
||||
Type valueType = keyValueFields.get(1);
|
||||
for (String key : json.keySet()) {
|
||||
// key
|
||||
writePrimitiveType(keyType, key, 0);
|
||||
|
||||
// value
|
||||
if (valueType.isPrimitive()) {
|
||||
writePrimitiveType(valueType, json.get(key), 1);
|
||||
} else {
|
||||
writeGroupType(valueType, (JSON) json.get(key), 1);
|
||||
}
|
||||
}
|
||||
|
||||
recordConsumer.endGroup();
|
||||
recordConsumer.endField("key_value", 0);
|
||||
// key_value end
|
||||
|
||||
recordConsumer.endGroup();
|
||||
recordConsumer.endField(groupType.getName(), index);
|
||||
}
|
||||
|
||||
private void writeList(GroupType groupType, JSON value, int index) {
|
||||
if (value == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
JSONArray json = (JSONArray) value;
|
||||
|
||||
if (json.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
recordConsumer.startField(groupType.getName(), index);
|
||||
// list
|
||||
recordConsumer.startGroup();
|
||||
|
||||
// list start
|
||||
recordConsumer.startField("list", 0);
|
||||
recordConsumer.startGroup();
|
||||
|
||||
Type elementType = groupType.getFields().get(0).asGroupType().getFields().get(0);
|
||||
|
||||
if (elementType.isPrimitive()) {
|
||||
for (Object elementValue : json) {
|
||||
writePrimitiveType(elementType, elementValue, 0);
|
||||
}
|
||||
} else {
|
||||
for (Object elementValue : json) {
|
||||
writeGroupType(elementType, (JSON) elementValue, 0);
|
||||
}
|
||||
}
|
||||
|
||||
recordConsumer.endGroup();
|
||||
recordConsumer.endField("list", 0);
|
||||
// list end
|
||||
recordConsumer.endGroup();
|
||||
|
||||
recordConsumer.endField(groupType.getName(), index);
|
||||
}
|
||||
|
||||
private void writeStruct(GroupType groupType, JSON value, int index) {
|
||||
if (value == null) {
|
||||
return;
|
||||
}
|
||||
JSONObject json = (JSONObject) value;
|
||||
if (json.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
recordConsumer.startField(groupType.getName(), index);
|
||||
// struct start
|
||||
recordConsumer.startGroup();
|
||||
|
||||
writeFields(groupType.getFields(), json);
|
||||
recordConsumer.endGroup();
|
||||
// struct end
|
||||
recordConsumer.endField(groupType.getName(), index);
|
||||
}
|
||||
|
||||
private void writePrimitiveType(Type type, Object value, int index) {
|
||||
if (value == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
recordConsumer.startField(type.getName(), index);
|
||||
PrimitiveType primitiveType = type.asPrimitiveType();
|
||||
|
||||
switch (primitiveType.getPrimitiveTypeName()) {
|
||||
case BOOLEAN:
|
||||
recordConsumer.addBoolean((Boolean) value);
|
||||
break;
|
||||
case FLOAT:
|
||||
if (value instanceof Float) {
|
||||
recordConsumer.addFloat(((Float) value).floatValue());
|
||||
} else if (value instanceof Double) {
|
||||
recordConsumer.addFloat(((Double) value).floatValue());
|
||||
} else if (value instanceof Long) {
|
||||
recordConsumer.addFloat(((Long) value).floatValue());
|
||||
} else if (value instanceof Integer) {
|
||||
recordConsumer.addFloat(((Integer) value).floatValue());
|
||||
}
|
||||
break;
|
||||
case DOUBLE:
|
||||
if (value instanceof Float) {
|
||||
recordConsumer.addDouble(((Float) value).doubleValue());
|
||||
} else if (value instanceof Double) {
|
||||
recordConsumer.addDouble(((Double) value).doubleValue());
|
||||
} else if (value instanceof Long) {
|
||||
recordConsumer.addDouble(((Long) value).doubleValue());
|
||||
} else if (value instanceof Integer) {
|
||||
recordConsumer.addDouble(((Integer) value).doubleValue());
|
||||
}
|
||||
break;
|
||||
case INT32:
|
||||
if (value instanceof Integer) {
|
||||
recordConsumer.addInteger((Integer) value);
|
||||
} else if (value instanceof Long) {
|
||||
recordConsumer.addInteger(((Long) value).intValue());
|
||||
} else {
|
||||
// 之前代码写的有问题,导致这里丢列了没抛异常,先收集,后续看看有没有任务命中在决定怎么改
|
||||
LimitLogger.limit("dirtyDataHiveWriterParquet", TimeUnit.MINUTES.toMillis(1), () -> LOGGER.warn("dirtyDataHiveWriterParquet {}", String.format("Invalid value: %s(clazz: %s) for field: %s", value, value.getClass(), type.getName())));
|
||||
}
|
||||
break;
|
||||
case INT64:
|
||||
if (value instanceof Integer) {
|
||||
recordConsumer.addLong(((Integer) value).longValue());
|
||||
} else if (value instanceof Long) {
|
||||
recordConsumer.addInteger(((Long) value).intValue());
|
||||
} else {
|
||||
// 之前代码写的有问题,导致这里丢列了没抛异常,先收集,后续看看有没有任务命中在决定怎么改
|
||||
LimitLogger.limit("dirtyDataHiveWriterParquet", TimeUnit.MINUTES.toMillis(1), () -> LOGGER.warn("dirtyDataHiveWriterParquet {}", String.format("Invalid value: %s(clazz: %s) for field: %s", value, value.getClass(), type.getName())));
|
||||
}
|
||||
break;
|
||||
case INT96:
|
||||
if (value instanceof Integer) {
|
||||
recordConsumer.addBinary(timestampColToBinary(new LongColumn((Integer) value)));
|
||||
} else if (value instanceof Long) {
|
||||
recordConsumer.addBinary(timestampColToBinary(new LongColumn((Long) value)));
|
||||
} else if (value instanceof Timestamp) {
|
||||
recordConsumer.addBinary(timestampColToBinary(new DateColumn((Timestamp) value)));
|
||||
} else if (value instanceof Date) {
|
||||
recordConsumer.addBinary(timestampColToBinary(new DateColumn((Date) value)));
|
||||
} else {
|
||||
recordConsumer.addBinary(timestampColToBinary(new StringColumn(value.toString())));
|
||||
}
|
||||
break;
|
||||
case FIXED_LEN_BYTE_ARRAY:
|
||||
if (primitiveType.getDecimalMetadata() != null) {
|
||||
// decimal
|
||||
Column column;
|
||||
if (value instanceof Integer) {
|
||||
column = new LongColumn((Integer) value);
|
||||
} else if (value instanceof Long) {
|
||||
column = new LongColumn((Long) value);
|
||||
} else if (value instanceof Double) {
|
||||
column = new DoubleColumn((Double) value);
|
||||
} else if (value instanceof BigDecimal) {
|
||||
column = new DoubleColumn((BigDecimal) value);
|
||||
} else {
|
||||
column = new StringColumn(value.toString());
|
||||
}
|
||||
recordConsumer.addBinary(decimalToBinary(column, primitiveType.getDecimalMetadata().getPrecision(), primitiveType.getDecimalMetadata().getScale()));
|
||||
break;
|
||||
}
|
||||
/* fall through */
|
||||
case BINARY:
|
||||
default:
|
||||
recordConsumer.addBinary(Binary.fromString((String) value));
|
||||
break;
|
||||
}
|
||||
recordConsumer.endField(type.getName(), index);
|
||||
}
|
||||
|
||||
private void writePrimitiveType(Type type, Column value, int index) {
|
||||
if (value == null || value.getRawData() == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
recordConsumer.startField(type.getName(), index);
|
||||
PrimitiveType primitiveType = type.asPrimitiveType();
|
||||
switch (primitiveType.getPrimitiveTypeName()) {
|
||||
case BOOLEAN:
|
||||
recordConsumer.addBoolean(value.asBoolean());
|
||||
break;
|
||||
case FLOAT:
|
||||
recordConsumer.addFloat(value.asDouble().floatValue());
|
||||
break;
|
||||
case DOUBLE:
|
||||
recordConsumer.addDouble(value.asDouble());
|
||||
break;
|
||||
case INT32:
|
||||
OriginalType originalType = type.getOriginalType();
|
||||
if (OriginalType.DATE.equals(originalType)) {
|
||||
int realVal = (int) (new java.sql.Date(value.asLong()).toLocalDate().toEpochDay());
|
||||
recordConsumer.addInteger(realVal);
|
||||
} else {
|
||||
recordConsumer.addInteger(value.asLong().intValue());
|
||||
}
|
||||
break;
|
||||
case INT64:
|
||||
recordConsumer.addLong(value.asLong());
|
||||
break;
|
||||
case INT96:
|
||||
recordConsumer.addBinary(timestampColToBinary(value));
|
||||
break;
|
||||
case BINARY:
|
||||
String valueAsString2Write = null;
|
||||
if (Column.Type.DATE == value.getType() && null != this.dateParse) {
|
||||
valueAsString2Write = dateParse.format(value.asDate());
|
||||
} else {
|
||||
valueAsString2Write = value.asString();
|
||||
}
|
||||
recordConsumer.addBinary(Binary.fromString(valueAsString2Write));
|
||||
break;
|
||||
case FIXED_LEN_BYTE_ARRAY:
|
||||
if (primitiveType.getDecimalMetadata() != null) {
|
||||
// decimal
|
||||
recordConsumer.addBinary(decimalToBinary(value, primitiveType.getDecimalMetadata().getPrecision(), primitiveType.getDecimalMetadata().getScale()));
|
||||
break;
|
||||
}
|
||||
/* fall through */
|
||||
default:
|
||||
recordConsumer.addBinary(Binary.fromString(value.asString()));
|
||||
break;
|
||||
}
|
||||
recordConsumer.endField(type.getName(), index);
|
||||
}
|
||||
}
|
@ -167,7 +167,7 @@ public class BaseWriter {
|
||||
if (null != renderedPreSqls && !renderedPreSqls.isEmpty()) {
|
||||
// 说明有 preSql 配置,则此处删除掉
|
||||
originalConfig.remove(Key.PRE_SQL);
|
||||
String tempJdbcUrl = jdbcUrl.replace("postgresql", "hologres");
|
||||
String tempJdbcUrl = jdbcUrl.replace("jdbc:postgresql://", "jdbc:hologres://");
|
||||
try (Connection conn = DriverManager.getConnection(
|
||||
tempJdbcUrl, username, password)) {
|
||||
LOG.info("Begin to execute preSqls:[{}]. context info:{}.",
|
||||
@ -191,32 +191,34 @@ public class BaseWriter {
|
||||
// 一般来说,是需要推迟到 task 中进行post 的执行(单表情况例外)
|
||||
public void post(Configuration originalConfig) {
|
||||
|
||||
String username = originalConfig.getString(Key.USERNAME);
|
||||
String password = originalConfig.getString(Key.PASSWORD);
|
||||
try {
|
||||
String username = originalConfig.getString(Key.USERNAME);
|
||||
String password = originalConfig.getString(Key.PASSWORD);
|
||||
|
||||
String jdbcUrl = originalConfig.getString(Key.JDBC_URL);
|
||||
String jdbcUrl = originalConfig.getString(Key.JDBC_URL);
|
||||
|
||||
String table = originalConfig.getString(Key.TABLE);
|
||||
String table = originalConfig.getString(Key.TABLE);
|
||||
|
||||
List<String> postSqls = originalConfig.getList(Key.POST_SQL,
|
||||
String.class);
|
||||
List<String> renderedPostSqls = WriterUtil.renderPreOrPostSqls(
|
||||
postSqls, table);
|
||||
List<String> postSqls = originalConfig.getList(Key.POST_SQL,
|
||||
String.class);
|
||||
List<String> renderedPostSqls = WriterUtil.renderPreOrPostSqls(
|
||||
postSqls, table);
|
||||
|
||||
if (null != renderedPostSqls && !renderedPostSqls.isEmpty()) {
|
||||
// 说明有 postSql 配置,则此处删除掉
|
||||
originalConfig.remove(Key.POST_SQL);
|
||||
String tempJdbcUrl = jdbcUrl.replace("postgresql", "hologres");
|
||||
Connection conn = DBUtil.getConnection(this.dataBaseType,
|
||||
tempJdbcUrl, username, password);
|
||||
|
||||
LOG.info(
|
||||
"Begin to execute postSqls:[{}]. context info:{}.",
|
||||
StringUtils.join(renderedPostSqls, ";"), tempJdbcUrl);
|
||||
WriterUtil.executeSqls(conn, renderedPostSqls, tempJdbcUrl, dataBaseType);
|
||||
DBUtil.closeDBResources(null, null, conn);
|
||||
if (null != renderedPostSqls && !renderedPostSqls.isEmpty()) {
|
||||
// 说明有 postSql 配置,则此处删除掉
|
||||
originalConfig.remove(Key.POST_SQL);
|
||||
String tempJdbcUrl = jdbcUrl.replace("jdbc:postgresql://", "jdbc:hologres://");
|
||||
try (Connection conn = DriverManager.getConnection(
|
||||
tempJdbcUrl, username, password)) {
|
||||
LOG.info(
|
||||
"Begin to execute postSqls:[{}]. context info:{}.",
|
||||
StringUtils.join(renderedPostSqls, ";"), tempJdbcUrl);
|
||||
WriterUtil.executeSqls(conn, renderedPostSqls, tempJdbcUrl, dataBaseType);
|
||||
}
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
throw DataXException.asDataXException(DBUtilErrorCode.SQL_EXECUTE_FAIL, e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void destroy(Configuration originalConfig) {
|
||||
|
@ -56,6 +56,16 @@
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>src/main/resources</directory>
|
||||
<includes>
|
||||
<include>**/*.*</include>
|
||||
</includes>
|
||||
<filtering>true</filtering>
|
||||
</resource>
|
||||
</resources>
|
||||
|
||||
<plugins>
|
||||
<!-- compiler plugin -->
|
||||
<plugin>
|
||||
|
@ -18,6 +18,7 @@ public class Neo4jWriter extends Writer {
|
||||
/**
 * Logs the init message and stores the plugin job configuration returned by
 * {@code getPluginJobConf()} in {@code jobConf}.
 */
@Override
|
||||
public void init() {
|
||||
// Note: this message is emitted before the configuration is fetched below.
LOGGER.info("Neo4jWriter Job init success");
|
||||
this.jobConf = getPluginJobConf();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
51
neo4jwriter/src/test/resources/streamreader2neo4j.json
Normal file
51
neo4jwriter/src/test/resources/streamreader2neo4j.json
Normal file
@ -0,0 +1,51 @@
|
||||
{
|
||||
"job": {
|
||||
"content": [
|
||||
{
|
||||
"reader": {
|
||||
"name": "streamreader",
|
||||
"parameter": {
|
||||
"sliceRecordCount": 10,
|
||||
"column": [
|
||||
{
|
||||
"type": "string",
|
||||
"value": "StreamReader"
|
||||
},
|
||||
{
|
||||
"type": "string",
|
||||
"value": "1997"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"writer": {
|
||||
"name": "neo4jWriter",
|
||||
"parameter": {
|
||||
"uri": "bolt://localhost:7687",
|
||||
"username":"neo4j",
|
||||
"password":"Test@12343",
|
||||
"database":"neo4j",
|
||||
"cypher": "unwind $batch as row CALL apoc.cypher.doIt( 'create (n:`' + row.Label + '`{id:$id})' ,{id: row.id} ) YIELD value RETURN 1 ",
|
||||
"batchDataVariableName": "batch",
|
||||
"batchSize": "3",
|
||||
"properties": [
|
||||
{
|
||||
"name": "Label",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"name": "id",
|
||||
"type": "STRING"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"setting": {
|
||||
"speed": {
|
||||
"channel": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
BIN
oceanbasev10reader/src/main/libs/oceanbase-client-1.1.10.jar
Normal file
BIN
oceanbasev10reader/src/main/libs/oceanbase-client-1.1.10.jar
Normal file
Binary file not shown.
@ -64,8 +64,16 @@
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.oceanbase</groupId>
|
||||
<artifactId>shade-ob-partition-calculator</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
<scope>system</scope>
|
||||
<systemPath>${pom.basedir}/src/main/libs/shade-ob-partition-calculator-1.0-SNAPSHOT.jar</systemPath>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>log4j</groupId>
|
||||
<artifactId>log4j</artifactId>
|
||||
<version>1.2.16</version>
|
||||
|
@ -6,6 +6,7 @@ public interface Config {
|
||||
|
||||
double DEFAULT_MEMSTORE_THRESHOLD = 0.9d;
|
||||
|
||||
double DEFAULT_SLOW_MEMSTORE_THRESHOLD = 0.75d;
|
||||
String MEMSTORE_CHECK_INTERVAL_SECOND = "memstoreCheckIntervalSecond";
|
||||
|
||||
long DEFAULT_MEMSTORE_CHECK_INTERVAL_SECOND = 30;
|
||||
|
@ -0,0 +1,48 @@
|
||||
package com.alibaba.datax.plugin.writer.oceanbasev10writer.ext;
|
||||
|
||||
import com.alibaba.datax.common.util.Configuration;
|
||||
import com.alibaba.datax.plugin.rdbms.util.DBUtil;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.Connection;
|
||||
|
||||
public abstract class AbstractConnHolder {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(AbstractConnHolder.class);
|
||||
|
||||
protected final Configuration config;
|
||||
protected Connection conn;
|
||||
|
||||
public AbstractConnHolder(Configuration config) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
public abstract Connection initConnection();
|
||||
|
||||
public Configuration getConfig() {
|
||||
return config;
|
||||
}
|
||||
|
||||
public Connection getConn() {
|
||||
try {
|
||||
if (conn != null && !conn.isClosed()) {
|
||||
return conn;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
LOG.warn("judge connection is closed or not failed. try to reconnect.", e);
|
||||
}
|
||||
return reconnect();
|
||||
}
|
||||
|
||||
public Connection reconnect() {
|
||||
DBUtil.closeDBResources(null, conn);
|
||||
return initConnection();
|
||||
}
|
||||
|
||||
public abstract String getJdbcUrl();
|
||||
|
||||
public abstract String getUserName();
|
||||
|
||||
public abstract void destroy();
|
||||
}
|
@ -23,7 +23,7 @@ import org.slf4j.LoggerFactory;
|
||||
public class DataBaseWriterBuffer {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(DataBaseWriterBuffer.class);
|
||||
|
||||
private final ConnHolder connHolder;
|
||||
private final AbstractConnHolder connHolder;
|
||||
private final String dbName;
|
||||
private Map<String, LinkedList<Record>> tableBuffer = new HashMap<String, LinkedList<Record>>();
|
||||
private long lastCheckMemstoreTime;
|
||||
@ -33,7 +33,7 @@ public class DataBaseWriterBuffer {
|
||||
this.dbName=dbName;
|
||||
}
|
||||
|
||||
public ConnHolder getConnHolder(){
|
||||
public AbstractConnHolder getConnHolder(){
|
||||
return connHolder;
|
||||
}
|
||||
|
||||
|
@ -3,15 +3,13 @@ package com.alibaba.datax.plugin.writer.oceanbasev10writer.ext;
|
||||
import java.sql.Connection;
|
||||
|
||||
import com.alibaba.datax.common.util.Configuration;
|
||||
import com.alibaba.datax.plugin.rdbms.util.DBUtil;
|
||||
import com.alibaba.datax.plugin.rdbms.util.DataBaseType;
|
||||
|
||||
/**
|
||||
* wrap oceanbase java client
|
||||
* @author oceanbase
|
||||
*/
|
||||
|
||||
public class OCJConnHolder extends ConnHolder {
|
||||
public class OCJConnHolder extends AbstractConnHolder {
|
||||
private ServerConnectInfo connectInfo;
|
||||
private String dataSourceKey;
|
||||
|
||||
@ -28,17 +26,6 @@ public class OCJConnHolder extends ConnHolder {
|
||||
return conn;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Connection reconnect() {
|
||||
DBUtil.closeDBResources(null, conn);
|
||||
return initConnection();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Connection getConn() {
|
||||
return conn;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getJdbcUrl() {
|
||||
return connectInfo.jdbcUrl;
|
||||
|
@ -16,7 +16,7 @@ import com.alibaba.datax.plugin.writer.oceanbasev10writer.util.ObWriterUtils;
|
||||
* @author oceanbase
|
||||
*
|
||||
*/
|
||||
public class ObClientConnHolder extends ConnHolder {
|
||||
public class ObClientConnHolder extends AbstractConnHolder {
|
||||
private final String jdbcUrl;
|
||||
private final String userName;
|
||||
private final String password;
|
||||
|
@ -1,5 +1,7 @@
|
||||
package com.alibaba.datax.plugin.writer.oceanbasev10writer.ext;
|
||||
|
||||
import static org.apache.commons.lang3.StringUtils.EMPTY;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@ -12,40 +14,19 @@ public class ServerConnectInfo {
|
||||
public String databaseName;
|
||||
public String ipPort;
|
||||
public String jdbcUrl;
|
||||
public boolean publicCloud;
|
||||
|
||||
/**
|
||||
*
|
||||
* @param jdbcUrl format is jdbc:oceanbase//ip:port
|
||||
* @param username format is cluster:tenant:username or username@tenant#cluster or user@tenant or user
|
||||
* @param password
|
||||
*/
|
||||
public ServerConnectInfo(final String jdbcUrl, final String username, final String password) {
|
||||
if (jdbcUrl.startsWith(com.alibaba.datax.plugin.rdbms.writer.Constant.OB10_SPLIT_STRING)) {
|
||||
String[] ss = jdbcUrl.split(com.alibaba.datax.plugin.rdbms.writer.Constant.OB10_SPLIT_STRING_PATTERN);
|
||||
if (ss.length != 3) {
|
||||
throw new RuntimeException("jdbc url format is not correct: " + jdbcUrl);
|
||||
}
|
||||
this.userName = username;
|
||||
this.clusterName = ss[1].trim().split(":")[0];
|
||||
this.tenantName = ss[1].trim().split(":")[1];
|
||||
this.jdbcUrl = ss[2].replace("jdbc:mysql:", "jdbc:oceanbase:");
|
||||
} else {
|
||||
this.jdbcUrl = jdbcUrl.replace("jdbc:mysql:", "jdbc:oceanbase:");
|
||||
if (username.contains("@") && username.contains("#")) {
|
||||
this.userName = username.substring(0, username.indexOf("@"));
|
||||
this.tenantName = username.substring(username.indexOf("@") + 1, username.indexOf("#"));
|
||||
this.clusterName = username.substring(username.indexOf("#") + 1);
|
||||
} else if (username.contains(":")) {
|
||||
String[] config = username.split(":");
|
||||
if (config.length != 3) {
|
||||
throw new RuntimeException ("username format is not correct: " + username);
|
||||
}
|
||||
this.clusterName = config[0];
|
||||
this.tenantName = config[1];
|
||||
this.userName = config[2];
|
||||
} else {
|
||||
this.clusterName = null;
|
||||
this.tenantName = null;
|
||||
this.userName = username;
|
||||
}
|
||||
}
|
||||
|
||||
this.jdbcUrl = jdbcUrl;
|
||||
this.password = password;
|
||||
parseJdbcUrl(jdbcUrl);
|
||||
parseFullUserName(username);
|
||||
}
|
||||
|
||||
private void parseJdbcUrl(final String jdbcUrl) {
|
||||
@ -56,11 +37,42 @@ public class ServerConnectInfo {
|
||||
String dbName = matcher.group(2);
|
||||
this.ipPort = ipPort;
|
||||
this.databaseName = dbName;
|
||||
this.publicCloud = ipPort.split(":")[0].endsWith("aliyuncs.com");
|
||||
} else {
|
||||
throw new RuntimeException("Invalid argument:" + jdbcUrl);
|
||||
}
|
||||
}
|
||||
|
||||
private void parseFullUserName(final String fullUserName) {
|
||||
int tenantIndex = fullUserName.indexOf("@");
|
||||
int clusterIndex = fullUserName.indexOf("#");
|
||||
if (fullUserName.contains(":") && tenantIndex < 0) {
|
||||
String[] names = fullUserName.split(":");
|
||||
if (names.length != 3) {
|
||||
throw new RuntimeException("invalid argument: " + fullUserName);
|
||||
} else {
|
||||
this.clusterName = names[0];
|
||||
this.tenantName = names[1];
|
||||
this.userName = names[2];
|
||||
}
|
||||
} else if (!publicCloud || tenantIndex < 0) {
|
||||
this.userName = tenantIndex < 0 ? fullUserName : fullUserName.substring(0, tenantIndex);
|
||||
this.clusterName = clusterIndex < 0 ? EMPTY : fullUserName.substring(clusterIndex + 1);
|
||||
this.tenantName = tenantIndex < 0 ? EMPTY : fullUserName.substring(tenantIndex + 1, clusterIndex);
|
||||
} else {
|
||||
// If in public cloud, the username with format user@tenant#cluster should be parsed, otherwise, connection can't be created.
|
||||
this.userName = fullUserName.substring(0, tenantIndex);
|
||||
if (clusterIndex > tenantIndex) {
|
||||
this.tenantName = fullUserName.substring(tenantIndex + 1, clusterIndex);
|
||||
this.clusterName = fullUserName.substring(clusterIndex + 1);
|
||||
} else {
|
||||
this.tenantName = fullUserName.substring(tenantIndex + 1);
|
||||
this.clusterName = EMPTY;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuffer strBuffer = new StringBuffer();
|
||||
return strBuffer.append("clusterName:").append(clusterName).append(", tenantName:").append(tenantName)
|
||||
@ -69,11 +81,18 @@ public class ServerConnectInfo {
|
||||
}
|
||||
|
||||
public String getFullUserName() {
|
||||
StringBuilder builder = new StringBuilder(userName);
|
||||
if (tenantName != null && clusterName != null) {
|
||||
builder.append("@").append(tenantName).append("#").append(clusterName);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append(userName);
|
||||
if (!EMPTY.equals(tenantName)) {
|
||||
builder.append("@").append(tenantName);
|
||||
}
|
||||
|
||||
if (!EMPTY.equals(clusterName)) {
|
||||
builder.append("#").append(clusterName);
|
||||
}
|
||||
if (EMPTY.equals(this.clusterName) && EMPTY.equals(this.tenantName)) {
|
||||
return this.userName;
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,19 @@
|
||||
package com.alibaba.datax.plugin.writer.oceanbasev10writer.part;
|
||||
|
||||
import com.alibaba.datax.common.element.Record;
|
||||
|
||||
/**
 * Computes the partition id of a record.
 *
 * @author cjyyz
 * @date 2023/02/07
 */
|
||||
public interface IObPartCalculator {
|
||||
|
||||
/**
 * Calculates the partition id for the given record.
 *
 * @param record the record whose partition id is calculated
 * @return the partition id as a {@code Long}
 */
|
||||
Long calculate(Record record);
|
||||
}
|
@ -0,0 +1,109 @@
|
||||
package com.alibaba.datax.plugin.writer.oceanbasev10writer.part;
|
||||
|
||||
import com.alibaba.datax.common.element.Record;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.ext.ServerConnectInfo;
|
||||
import com.alipay.oceanbase.obproxy.data.TableEntryKey;
|
||||
import com.alipay.oceanbase.obproxy.util.ObPartitionIdCalculator;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* OceanBase 1.x和2.x的分区计算
|
||||
*
|
||||
* @author cjyyz
|
||||
* @date 2023/02/07
|
||||
* @since
|
||||
*/
|
||||
public class ObPartitionCalculatorV1 implements IObPartCalculator {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(ObPartitionCalculatorV1.class);
|
||||
|
||||
/**
|
||||
* 分区键的位置
|
||||
*/
|
||||
private List<Integer> partIndexes;
|
||||
|
||||
/**
|
||||
* 表的全部字段名
|
||||
*/
|
||||
private List<String> columnNames;
|
||||
|
||||
/**
|
||||
* ocj partition calculator
|
||||
*/
|
||||
private ObPartitionIdCalculator calculator;
|
||||
|
||||
/**
|
||||
* @param connectInfo
|
||||
* @param table
|
||||
* @param columns
|
||||
*/
|
||||
public ObPartitionCalculatorV1(ServerConnectInfo connectInfo, String table, List<String> columns) {
|
||||
|
||||
initCalculator(connectInfo, table);
|
||||
|
||||
if (Objects.isNull(calculator)) {
|
||||
LOG.warn("partCalculator is null");
|
||||
return;
|
||||
}
|
||||
|
||||
this.partIndexes = new ArrayList<>(columns.size());
|
||||
this.columnNames = new ArrayList<>(columns);
|
||||
|
||||
for (int i = 0; i < columns.size(); ++i) {
|
||||
String columnName = columns.get(i);
|
||||
if (calculator.isPartitionKeyColumn(columnName)) {
|
||||
LOG.info(columnName + " is partition key.");
|
||||
partIndexes.add(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param record
|
||||
* @return Long
|
||||
*/
|
||||
@Override
|
||||
public Long calculate(Record record) {
|
||||
if (Objects.isNull(calculator)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
for (Integer i : partIndexes) {
|
||||
calculator.addColumn(columnNames.get(i), record.getColumn(i).asString());
|
||||
}
|
||||
return calculator.calculate();
|
||||
}
|
||||
|
||||
/**
|
||||
* @param connectInfo
|
||||
* @param table
|
||||
*/
|
||||
private void initCalculator(ServerConnectInfo connectInfo, String table) {
|
||||
|
||||
LOG.info(String.format("create tableEntryKey with clusterName %s, tenantName %s, databaseName %s, tableName %s",
|
||||
connectInfo.clusterName, connectInfo.tenantName, connectInfo.databaseName, table));
|
||||
TableEntryKey tableEntryKey = new TableEntryKey(connectInfo.clusterName, connectInfo.tenantName,
|
||||
connectInfo.databaseName, table);
|
||||
|
||||
int retry = 0;
|
||||
|
||||
do {
|
||||
try {
|
||||
if (retry > 0) {
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
LOG.info("retry create new part calculator {} times", retry);
|
||||
}
|
||||
LOG.info("create partCalculator with address: " + connectInfo.ipPort);
|
||||
calculator = new ObPartitionIdCalculator(connectInfo.ipPort, tableEntryKey);
|
||||
} catch (Exception ex) {
|
||||
++retry;
|
||||
LOG.warn("create new part calculator failed, retry: {}", ex.getMessage());
|
||||
}
|
||||
} while (calculator == null && retry < 3);
|
||||
}
|
||||
}
|
@ -0,0 +1,169 @@
|
||||
package com.alibaba.datax.plugin.writer.oceanbasev10writer.part;
|
||||
|
||||
import com.alibaba.datax.common.element.Record;
|
||||
import com.alibaba.datax.plugin.rdbms.util.DBUtil;
|
||||
import com.alibaba.datax.plugin.rdbms.util.DataBaseType;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.ext.ServerConnectInfo;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.util.DbUtils;
|
||||
import com.oceanbase.partition.calculator.ObPartIdCalculator;
|
||||
import com.oceanbase.partition.calculator.enums.ObPartLevel;
|
||||
import com.oceanbase.partition.calculator.enums.ObServerMode;
|
||||
import com.oceanbase.partition.calculator.helper.TableEntryExtractor;
|
||||
import com.oceanbase.partition.calculator.model.TableEntry;
|
||||
import com.oceanbase.partition.calculator.model.TableEntryKey;
|
||||
import com.oceanbase.partition.calculator.model.Version;
|
||||
import com.oceanbase.partition.metadata.desc.ObPartColumn;
|
||||
import com.oceanbase.partition.metadata.desc.ObTablePart;
|
||||
import java.sql.Connection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* OceanBase 3.x和4.x的分区计算
|
||||
*
|
||||
* @author cjyyz
|
||||
* @date 2023/02/07
|
||||
* @since
|
||||
*/
|
||||
public class ObPartitionCalculatorV2 implements IObPartCalculator {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(ObPartitionCalculatorV2.class);
|
||||
|
||||
/**
|
||||
* OB的模式以及版本信息
|
||||
*/
|
||||
private ObServerMode mode;
|
||||
|
||||
/**
|
||||
* ob-partition-calculator 分区计算组件
|
||||
*/
|
||||
private ObPartIdCalculator calculator;
|
||||
|
||||
/**
|
||||
* 记录columns的字段名和在record中的位置。
|
||||
* 当目标表结构的分区键是生成列时,calculator 需要从改结构中获取到生成列所依赖的字段的值
|
||||
* e.g.
|
||||
* create table t1 (
|
||||
* c1 varchar(20),
|
||||
* c2 varchar(20) generated always as (substr(`c1`,1,8))
|
||||
* )partition by key(c2) partitions 5
|
||||
*
|
||||
* 此时,columnNameIndexMap包含的元素是 c1:0
|
||||
* 需要将c1字段的值从columnNameIndexMap中添加到{@link com.oceanbase.partition.calculator.ObPartIdCalculator#getRefColumnValues()}
|
||||
*/
|
||||
private Map<String, Integer> columnNameIndexMap;
|
||||
|
||||
/**
|
||||
* @param connectInfo
|
||||
* @param table
|
||||
* @param mode
|
||||
*/
|
||||
public ObPartitionCalculatorV2(ServerConnectInfo connectInfo, String table, ObServerMode mode, List<String> columns) {
|
||||
this.mode = mode;
|
||||
this.columnNameIndexMap = new HashMap<>();
|
||||
for (int i = 0; i < columns.size(); i++) {
|
||||
columnNameIndexMap.put(columns.get(i).toLowerCase(), i);
|
||||
}
|
||||
initCalculator(connectInfo, table);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param record
|
||||
* @return Long
|
||||
*/
|
||||
@Override
|
||||
public Long calculate(Record record) {
|
||||
if (Objects.isNull(calculator)) {
|
||||
return null;
|
||||
}
|
||||
if (!calculator.getTableEntry().isPartitionTable()) {
|
||||
return 0L;
|
||||
}
|
||||
return calculator.calculatePartId(filterNullableColumns(record));
|
||||
}
|
||||
|
||||
/**
|
||||
* 初始化分区计算组件
|
||||
*
|
||||
* @param connectInfo
|
||||
* @param table
|
||||
*/
|
||||
private void initCalculator(ServerConnectInfo connectInfo, String table) {
|
||||
TableEntryKey tableEntryKey = new TableEntryKey(connectInfo.clusterName, connectInfo.tenantName, connectInfo.databaseName, table, mode);
|
||||
boolean subsequentFromV4 = !mode.getVersion().isOlderThan(new Version("4.0.0.0"));
|
||||
try {
|
||||
TableEntry tableEntry;
|
||||
try (Connection conn = getConnection(connectInfo, subsequentFromV4)){
|
||||
TableEntryExtractor extractor = new TableEntryExtractor();
|
||||
tableEntry = extractor.queryTableEntry(conn, tableEntryKey,subsequentFromV4);
|
||||
}
|
||||
this.calculator = new ObPartIdCalculator(false, tableEntry, subsequentFromV4);
|
||||
} catch (Exception e) {
|
||||
LOG.warn("create new part calculator failed. reason: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
private Connection getConnection(ServerConnectInfo connectInfo, boolean subsequentFromV4) throws Exception {
|
||||
// OceanBase 4.0.0.0及之后版本均使用业务租户连接计算分区
|
||||
if (subsequentFromV4) {
|
||||
return DBUtil.getConnection(DataBaseType.OceanBase, connectInfo.jdbcUrl, connectInfo.getFullUserName(), connectInfo.password);
|
||||
}
|
||||
// OceanBase 4.0.0.0之前版本使用sys租户连接计算分区
|
||||
return DbUtils.buildSysConn(connectInfo.jdbcUrl, connectInfo.clusterName);
|
||||
}
|
||||
|
||||
/**
|
||||
* 只选择分区字段值传入分区计算组件
|
||||
*
|
||||
* @param record
|
||||
* @return Object[]
|
||||
*/
|
||||
private Object[] filterNullableColumns(Record record) {
|
||||
final ObTablePart tablePart = calculator.getTableEntry().getTablePart();
|
||||
|
||||
final Object[] filteredRecords = new Object[record.getColumnNumber()];
|
||||
|
||||
if (tablePart.getLevel().getIndex() > ObPartLevel.LEVEL_ZERO.getIndex()) {
|
||||
// 从record中添加非生成列的一级分区值到filteredRecords数组中
|
||||
for (ObPartColumn partColumn : tablePart.getPartColumns()) {
|
||||
if (partColumn.getColumnExpr() == null) {
|
||||
int metaIndex = partColumn.getColumnIndex();
|
||||
String columnName = partColumn.getColumnName().toLowerCase();
|
||||
int idxInRecord = columnNameIndexMap.get(columnName);
|
||||
filteredRecords[metaIndex] = record.getColumn(idxInRecord).asString();
|
||||
}
|
||||
|
||||
}
|
||||
// 从record中添加生成列的一级分区值到calculator的redColumnMap中,ObTablePart.getRefPartColumns中的字段名均为小写
|
||||
for (ObPartColumn partColumn : tablePart.getRefPartColumns()) {
|
||||
String columnName = partColumn.getColumnName();
|
||||
int index = columnNameIndexMap.get(columnName);
|
||||
calculator.addRefColumn(columnName, record.getColumn(index).asString());
|
||||
}
|
||||
}
|
||||
|
||||
if (tablePart.getLevel().getIndex() >= ObPartLevel.LEVEL_TWO.getIndex()) {
|
||||
// 从record中添加非生成列的二级分区值到filteredRecords数组中
|
||||
for (ObPartColumn partColumn : tablePart.getSubPartColumns()) {
|
||||
if (partColumn.getColumnExpr() == null) {
|
||||
int metaIndex = partColumn.getColumnIndex();
|
||||
String columnName = partColumn.getColumnName().toLowerCase();
|
||||
int idxInRecord = columnNameIndexMap.get(columnName);
|
||||
filteredRecords[metaIndex] = record.getColumn(idxInRecord).asString();
|
||||
}
|
||||
|
||||
}
|
||||
// 从record中添加生成列的二级分区值到calculator的redColumnMap中,ObTablePart.getRefSubPartColumns中的字段名均为小写
|
||||
for (ObPartColumn partColumn : tablePart.getRefSubPartColumns()) {
|
||||
String columnName = partColumn.getColumnName();
|
||||
int index = columnNameIndexMap.get(columnName);
|
||||
calculator.addRefColumn(columnName, record.getColumn(index).asString());
|
||||
}
|
||||
}
|
||||
return filteredRecords;
|
||||
}
|
||||
}
|
@ -1,6 +1,5 @@
|
||||
package com.alibaba.datax.plugin.writer.oceanbasev10writer.task;
|
||||
|
||||
import com.alibaba.datax.common.element.Column;
|
||||
import com.alibaba.datax.common.element.Record;
|
||||
import com.alibaba.datax.common.exception.DataXException;
|
||||
import com.alibaba.datax.common.plugin.RecordReceiver;
|
||||
@ -11,16 +10,14 @@ import com.alibaba.datax.plugin.rdbms.util.DBUtilErrorCode;
|
||||
import com.alibaba.datax.plugin.rdbms.util.DataBaseType;
|
||||
import com.alibaba.datax.plugin.rdbms.writer.CommonRdbmsWriter;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.Config;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.ext.ConnHolder;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.ext.AbstractConnHolder;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.ext.ObClientConnHolder;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.ext.ServerConnectInfo;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.part.IObPartCalculator;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.part.ObPartitionCalculatorV1;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.part.ObPartitionCalculatorV2;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.util.ObWriterUtils;
|
||||
import com.alipay.oceanbase.obproxy.data.TableEntryKey;
|
||||
import com.alipay.oceanbase.obproxy.util.ObPartitionIdCalculator;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.oceanbase.partition.calculator.enums.ObServerMode;
|
||||
import java.sql.Connection;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.SQLException;
|
||||
@ -35,8 +32,12 @@ import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.concurrent.locks.Condition;
|
||||
import java.util.concurrent.locks.Lock;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
|
||||
//import java.sql.PreparedStatement;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import static com.alibaba.datax.plugin.writer.oceanbasev10writer.Config.DEFAULT_SLOW_MEMSTORE_THRESHOLD;
|
||||
import static com.alibaba.datax.plugin.writer.oceanbasev10writer.util.ObWriterUtils.LoadMode.FAST;
|
||||
import static com.alibaba.datax.plugin.writer.oceanbasev10writer.util.ObWriterUtils.LoadMode.PAUSE;
|
||||
import static com.alibaba.datax.plugin.writer.oceanbasev10writer.util.ObWriterUtils.LoadMode.SLOW;
|
||||
|
||||
public class ConcurrentTableWriterTask extends CommonRdbmsWriter.Task {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(ConcurrentTableWriterTask.class);
|
||||
@ -47,41 +48,31 @@ public class ConcurrentTableWriterTask extends CommonRdbmsWriter.Task {
|
||||
private long memstoreCheckIntervalSecond = Config.DEFAULT_MEMSTORE_CHECK_INTERVAL_SECOND;
|
||||
// 最后一次检查
|
||||
private long lastCheckMemstoreTime;
|
||||
|
||||
private volatile ObWriterUtils.LoadMode loadMode = FAST;
|
||||
|
||||
private static AtomicLong totalTask = new AtomicLong(0);
|
||||
private long taskId = -1;
|
||||
|
||||
private AtomicBoolean isMemStoreFull = new AtomicBoolean(false);
|
||||
private ConnHolder checkConnHolder;
|
||||
private HashMap<Long, List<Record>> groupInsertValues;
|
||||
private IObPartCalculator obPartCalculator;
|
||||
private ConcurrentTableWriter concurrentWriter = null;
|
||||
private AbstractConnHolder connHolder;
|
||||
private boolean allTaskInQueue = false;
|
||||
private Lock lock = new ReentrantLock();
|
||||
private Condition condition = lock.newCondition();
|
||||
private long startTime;
|
||||
private String obWriteMode = "update";
|
||||
private boolean isOracleCompatibleMode = false;
|
||||
private String obUpdateColumns = null;
|
||||
private String dbName;
|
||||
private int calPartFailedCount = 0;
|
||||
|
||||
public ConcurrentTableWriterTask(DataBaseType dataBaseType) {
|
||||
public ConcurrentTableWriterTask(DataBaseType dataBaseType) {
|
||||
super(dataBaseType);
|
||||
taskId = totalTask.getAndIncrement();
|
||||
}
|
||||
|
||||
private ObPartitionIdCalculator partCalculator = null;
|
||||
|
||||
private HashMap<Long, List<Record>> groupInsertValues;
|
||||
List<Record> unknownPartRecords = new ArrayList<Record>();
|
||||
// private List<Record> unknownPartRecords;
|
||||
private List<Integer> partitionKeyIndexes;
|
||||
|
||||
private ConcurrentTableWriter concurrentWriter = null;
|
||||
|
||||
private ConnHolder connHolder;
|
||||
|
||||
private boolean allTaskInQueue = false;
|
||||
|
||||
private Lock lock = new ReentrantLock();
|
||||
private Condition condition = lock.newCondition();
|
||||
|
||||
private long startTime;
|
||||
private String obWriteMode = "update";
|
||||
private boolean isOracleCompatibleMode = false;
|
||||
private String obUpdateColumns = null;
|
||||
private List<Pair<String, int[]>> deleteColPos;
|
||||
private String dbName;
|
||||
|
||||
@Override
|
||||
public void init(Configuration config) {
|
||||
super.init(config);
|
||||
@ -95,15 +86,11 @@ public class ConcurrentTableWriterTask extends CommonRdbmsWriter.Task {
|
||||
this.memstoreThreshold = config.getDouble(Config.MEMSTORE_THRESHOLD, Config.DEFAULT_MEMSTORE_THRESHOLD);
|
||||
this.memstoreCheckIntervalSecond = config.getLong(Config.MEMSTORE_CHECK_INTERVAL_SECOND,
|
||||
Config.DEFAULT_MEMSTORE_CHECK_INTERVAL_SECOND);
|
||||
this.isOracleCompatibleMode = ObWriterUtils.isOracleMode();
|
||||
|
||||
LOG.info("configure url is unavailable, use obclient for connections.");
|
||||
this.checkConnHolder = new ObClientConnHolder(config, connectInfo.jdbcUrl,
|
||||
this.connHolder = new ObClientConnHolder(config, connectInfo.jdbcUrl,
|
||||
connectInfo.getFullUserName(), connectInfo.password);
|
||||
this.connHolder = new ObClientConnHolder(config, connectInfo.jdbcUrl,
|
||||
connectInfo.getFullUserName(), connectInfo.password);
|
||||
checkConnHolder.initConnection();
|
||||
if (isOracleCompatibleMode) {
|
||||
this.isOracleCompatibleMode = ObWriterUtils.isOracleMode();
|
||||
if (isOracleCompatibleMode) {
|
||||
connectInfo.databaseName = connectInfo.databaseName.toUpperCase();
|
||||
//在转义的情况下不翻译
|
||||
if (!(table.startsWith("\"") && table.endsWith("\""))) {
|
||||
@ -115,43 +102,36 @@ public class ConcurrentTableWriterTask extends CommonRdbmsWriter.Task {
|
||||
}
|
||||
|
||||
if (config.getBool(Config.USE_PART_CALCULATOR, Config.DEFAULT_USE_PART_CALCULATOR)) {
|
||||
initPartCalculator(connectInfo);
|
||||
this.obPartCalculator = createPartitionCalculator(connectInfo, ObServerMode.from(config.getString(Config.OB_COMPATIBLE_MODE), config.getString(Config.OB_VERSION)));
|
||||
} else {
|
||||
LOG.info("Disable partition calculation feature.");
|
||||
}
|
||||
|
||||
obUpdateColumns = config.getString(Config.OB_UPDATE_COLUMNS, null);
|
||||
groupInsertValues = new HashMap<Long, List<Record>>();
|
||||
partitionKeyIndexes = new ArrayList<Integer>();
|
||||
rewriteSql();
|
||||
obUpdateColumns = config.getString(Config.OB_UPDATE_COLUMNS, null);
|
||||
groupInsertValues = new HashMap<Long, List<Record>>();
|
||||
rewriteSql();
|
||||
|
||||
if (null == concurrentWriter) {
|
||||
concurrentWriter = new ConcurrentTableWriter(config, connectInfo, writeRecordSql);
|
||||
allTaskInQueue = false;
|
||||
}
|
||||
}
|
||||
if (null == concurrentWriter) {
|
||||
concurrentWriter = new ConcurrentTableWriter(config, connectInfo, writeRecordSql);
|
||||
allTaskInQueue = false;
|
||||
}
|
||||
}
|
||||
|
||||
private void initPartCalculator(ServerConnectInfo connectInfo) {
|
||||
int retry = 0;
|
||||
LOG.info(String.format("create tableEntryKey with clusterName %s, tenantName %s, databaseName %s, tableName %s",
|
||||
connectInfo.clusterName, connectInfo.tenantName, connectInfo.databaseName, table));
|
||||
TableEntryKey tableEntryKey = new TableEntryKey(connectInfo.clusterName, connectInfo.tenantName,
|
||||
connectInfo.databaseName, table);
|
||||
do {
|
||||
try {
|
||||
if (retry > 0) {
|
||||
int sleep = retry > 8 ? 500 : (1 << retry);
|
||||
TimeUnit.SECONDS.sleep(sleep);
|
||||
LOG.info("retry create new part calculator, the {} times", retry);
|
||||
}
|
||||
LOG.info("create partCalculator with address: " + connectInfo.ipPort);
|
||||
partCalculator = new ObPartitionIdCalculator(connectInfo.ipPort, tableEntryKey);
|
||||
} catch (Exception ex) {
|
||||
++retry;
|
||||
LOG.warn("create new part calculator failed, retry {}: {}", retry, ex.getMessage());
|
||||
}
|
||||
} while (partCalculator == null && retry < 3); // try 3 times
|
||||
}
|
||||
/**
|
||||
* 创建需要的分区计算组件
|
||||
*
|
||||
* @param connectInfo
|
||||
* @return
|
||||
*/
|
||||
private IObPartCalculator createPartitionCalculator(ServerConnectInfo connectInfo, ObServerMode obServerMode) {
|
||||
if (obServerMode.isSubsequentFrom("3.0.0.0")) {
|
||||
LOG.info("oceanbase version is {}, use ob-partition-calculator to calculate partition Id.", obServerMode.getVersion());
|
||||
return new ObPartitionCalculatorV2(connectInfo, table, obServerMode, columns);
|
||||
}
|
||||
|
||||
LOG.info("oceanbase version is {}, use ocj to calculate partition Id.", obServerMode.getVersion());
|
||||
return new ObPartitionCalculatorV1(connectInfo, table, columns);
|
||||
}
|
||||
|
||||
public boolean isFinished() {
|
||||
return allTaskInQueue && concurrentWriter.checkFinish();
|
||||
@ -174,43 +154,18 @@ public class ConcurrentTableWriterTask extends CommonRdbmsWriter.Task {
|
||||
if (isOracleCompatibleMode && obWriteMode.equalsIgnoreCase("update")) {
|
||||
// change obWriteMode to insert so the insert statement will be generated.
|
||||
obWriteMode = "insert";
|
||||
deleteColPos = ObWriterUtils.buildDeleteSql(conn, dbName, table, columns);
|
||||
}
|
||||
this.writeRecordSql = ObWriterUtils.buildWriteSql(table, columns, conn, obWriteMode, obUpdateColumns);
|
||||
LOG.info("writeRecordSql :{}", this.writeRecordSql);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void prepare(Configuration writerSliceConfig) {
|
||||
super.prepare(writerSliceConfig);
|
||||
calPartitionKeyIndex(partitionKeyIndexes);
|
||||
concurrentWriter.start();
|
||||
}
|
||||
|
||||
private void calPartitionKeyIndex(List<Integer> partKeyIndexes) {
|
||||
partKeyIndexes.clear();
|
||||
if (null == partCalculator) {
|
||||
LOG.error("partCalculator is null");
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < columns.size(); ++i) {
|
||||
if (partCalculator.isPartitionKeyColumn(columns.get(i))) {
|
||||
LOG.info(columns.get(i) + " is partition key.");
|
||||
partKeyIndexes.add(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Long calPartitionId(List<Integer> partKeyIndexes, Record record) {
|
||||
if (partCalculator == null) {
|
||||
return null;
|
||||
}
|
||||
for (Integer i : partKeyIndexes) {
|
||||
partCalculator.addColumn(columns.get(i), record.getColumn(i).asString());
|
||||
}
|
||||
return partCalculator.calculate();
|
||||
}
|
||||
|
||||
@Override
|
||||
@Override
|
||||
public void startWriteWithConnection(RecordReceiver recordReceiver, TaskPluginCollector taskPluginCollector, Connection connection) {
|
||||
this.taskPluginCollector = taskPluginCollector;
|
||||
|
||||
@ -271,21 +226,6 @@ public class ConcurrentTableWriterTask extends CommonRdbmsWriter.Task {
|
||||
return fillPreparedStatement(preparedStatement, record);
|
||||
}
|
||||
|
||||
public PreparedStatement fillStatementIndex(PreparedStatement preparedStatement,
|
||||
int prepIdx, int columnIndex, Column column) throws SQLException {
|
||||
int columnSqltype = this.resultSetMetaData.getMiddle().get(columnIndex);
|
||||
String typeName = this.resultSetMetaData.getRight().get(columnIndex);
|
||||
return fillPreparedStatementColumnType(preparedStatement, prepIdx, columnSqltype, typeName, column);
|
||||
}
|
||||
|
||||
public void collectDirtyRecord(Record record, SQLException e) {
|
||||
taskPluginCollector.collectDirtyRecord(record, e);
|
||||
}
|
||||
|
||||
public void insertOneRecord(Connection connection, List<Record> buffer) {
|
||||
doOneInsert(connection, buffer);
|
||||
}
|
||||
|
||||
private void addLeftRecords() {
|
||||
//不需要刷新Cache,已经是最后一批数据了
|
||||
for (List<Record> groupValues : groupInsertValues.values()) {
|
||||
@ -293,17 +233,16 @@ public class ConcurrentTableWriterTask extends CommonRdbmsWriter.Task {
|
||||
addRecordsToWriteQueue(groupValues);
|
||||
}
|
||||
}
|
||||
if (unknownPartRecords.size() > 0) {
|
||||
addRecordsToWriteQueue(unknownPartRecords);
|
||||
}
|
||||
}
|
||||
|
||||
private void addRecordToCache(final Record record) {
|
||||
Long partId =null;
|
||||
try {
|
||||
partId = calPartitionId(partitionKeyIndexes, record);
|
||||
partId = obPartCalculator == null ? Long.MAX_VALUE : obPartCalculator.calculate(record);
|
||||
} catch (Exception e1) {
|
||||
LOG.warn("fail to get partition id: " + e1.getMessage() + ", record: " + record);
|
||||
if (calPartFailedCount++ < 10) {
|
||||
LOG.warn("fail to get partition id: " + e1.getMessage() + ", record: " + record);
|
||||
}
|
||||
}
|
||||
|
||||
if (partId == null) {
|
||||
@ -311,24 +250,11 @@ public class ConcurrentTableWriterTask extends CommonRdbmsWriter.Task {
|
||||
partId = Long.MAX_VALUE;
|
||||
}
|
||||
|
||||
if (partId != null) {
|
||||
List<Record> groupValues = groupInsertValues.get(partId);
|
||||
if (groupValues == null) {
|
||||
groupValues = new ArrayList<Record>(batchSize);
|
||||
groupInsertValues.put(partId, groupValues);
|
||||
}
|
||||
groupValues.add(record);
|
||||
if (groupValues.size() >= batchSize) {
|
||||
groupValues = addRecordsToWriteQueue(groupValues);
|
||||
groupInsertValues.put(partId, groupValues);
|
||||
}
|
||||
} else {
|
||||
LOG.debug("add unknown part record {}", record);
|
||||
unknownPartRecords.add(record);
|
||||
if (unknownPartRecords.size() >= batchSize) {
|
||||
unknownPartRecords = addRecordsToWriteQueue(unknownPartRecords);
|
||||
}
|
||||
|
||||
List<Record> groupValues = groupInsertValues.computeIfAbsent(partId, k -> new ArrayList<Record>(batchSize));
|
||||
groupValues.add(record);
|
||||
if (groupValues.size() >= batchSize) {
|
||||
groupValues = addRecordsToWriteQueue(groupValues);
|
||||
groupInsertValues.put(partId, groupValues);
|
||||
}
|
||||
}
|
||||
|
||||
@ -354,15 +280,25 @@ public class ConcurrentTableWriterTask extends CommonRdbmsWriter.Task {
|
||||
return new ArrayList<Record>(batchSize);
|
||||
}
|
||||
private void checkMemStore() {
|
||||
Connection checkConn = checkConnHolder.reconnect();
|
||||
Connection checkConn = connHolder.getConn();
|
||||
try {
|
||||
if (checkConn == null || checkConn.isClosed()) {
|
||||
checkConn = connHolder.reconnect();
|
||||
}
|
||||
}catch (Exception e) {
|
||||
LOG.warn("Check connection is unusable");
|
||||
}
|
||||
|
||||
long now = System.currentTimeMillis();
|
||||
if (now - lastCheckMemstoreTime < 1000 * memstoreCheckIntervalSecond) {
|
||||
return;
|
||||
}
|
||||
boolean isFull = ObWriterUtils.isMemstoreFull(checkConn, memstoreThreshold);
|
||||
this.isMemStoreFull.set(isFull);
|
||||
if (isFull) {
|
||||
LOG.warn("OB memstore is full,sleep 30 seconds, threshold=" + memstoreThreshold);
|
||||
double memUsedRatio = ObWriterUtils.queryMemUsedRatio(checkConn);
|
||||
if (memUsedRatio >= DEFAULT_SLOW_MEMSTORE_THRESHOLD) {
|
||||
this.loadMode = memUsedRatio >= memstoreThreshold ? PAUSE : SLOW;
|
||||
LOG.info("Memstore used ration is {}. Load data {}", memUsedRatio, loadMode.name());
|
||||
}else {
|
||||
this.loadMode = FAST;
|
||||
}
|
||||
lastCheckMemstoreTime = now;
|
||||
}
|
||||
@ -370,21 +306,23 @@ public class ConcurrentTableWriterTask extends CommonRdbmsWriter.Task {
|
||||
public boolean isMemStoreFull() {
|
||||
return isMemStoreFull.get();
|
||||
}
|
||||
|
||||
public void printEveryTime() {
|
||||
long cost = System.currentTimeMillis() - startTime;
|
||||
if (cost > 10000) { //10s
|
||||
print();
|
||||
startTime = System.currentTimeMillis();
|
||||
}
|
||||
|
||||
public boolean isShouldPause() {
|
||||
return this.loadMode.equals(PAUSE);
|
||||
}
|
||||
|
||||
public boolean isShouldSlow() {
|
||||
return this.loadMode.equals(SLOW);
|
||||
}
|
||||
|
||||
public void print() {
|
||||
LOG.debug("Statistic total task {}, finished {}, queue Size {}",
|
||||
concurrentWriter.getTotalTaskCount(),
|
||||
concurrentWriter.getFinishTaskCount(),
|
||||
concurrentWriter.getTaskQueueSize());
|
||||
concurrentWriter.printStatistics();
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Statistic total task {}, finished {}, queue Size {}",
|
||||
concurrentWriter.getTotalTaskCount(),
|
||||
concurrentWriter.getFinishTaskCount(),
|
||||
concurrentWriter.getTaskQueueSize());
|
||||
concurrentWriter.printStatistics();
|
||||
}
|
||||
}
|
||||
|
||||
public void waitTaskFinish() {
|
||||
@ -417,8 +355,6 @@ public class ConcurrentTableWriterTask extends CommonRdbmsWriter.Task {
|
||||
}
|
||||
// 把本级持有的conn关闭掉
|
||||
DBUtil.closeDBResources(null, connHolder.getConn());
|
||||
DBUtil.closeDBResources(null, checkConnHolder.getConn());
|
||||
checkConnHolder.destroy();
|
||||
super.destroy(writerSliceConfig);
|
||||
}
|
||||
|
||||
@ -469,7 +405,7 @@ public class ConcurrentTableWriterTask extends CommonRdbmsWriter.Task {
|
||||
public synchronized void start() {
|
||||
for (int i = 0; i < threadCount; ++i) {
|
||||
LOG.info("start {} insert task.", (i+1));
|
||||
InsertTask insertTask = new InsertTask(taskId, queue, config, connectInfo, rewriteRecordSql, deleteColPos);
|
||||
InsertTask insertTask = new InsertTask(taskId, queue, config, connectInfo, rewriteRecordSql);
|
||||
insertTask.setWriterTask(ConcurrentTableWriterTask.this);
|
||||
insertTask.setWriter(this);
|
||||
insertTasks.add(insertTask);
|
||||
@ -495,7 +431,7 @@ public class ConcurrentTableWriterTask extends CommonRdbmsWriter.Task {
|
||||
public void addBatchRecords(final List<Record> records) throws InterruptedException {
|
||||
boolean isSucc = false;
|
||||
while (!isSucc) {
|
||||
isSucc = queue.offer(records, 5, TimeUnit.SECONDS);
|
||||
isSucc = queue.offer(records, 5, TimeUnit.MILLISECONDS);
|
||||
checkMemStore();
|
||||
}
|
||||
totalTaskCount.incrementAndGet();
|
||||
|
@ -1,286 +1,204 @@
|
||||
package com.alibaba.datax.plugin.writer.oceanbasev10writer.task;
|
||||
import java.sql.Connection;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import com.alibaba.datax.common.exception.DataXException;
|
||||
import com.alibaba.datax.plugin.rdbms.util.DBUtil;
|
||||
import com.alibaba.datax.plugin.rdbms.util.DBUtilErrorCode;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.ext.ObClientConnHolder;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.alibaba.datax.common.element.Record;
|
||||
import com.alibaba.datax.common.util.Configuration;
|
||||
import com.alibaba.datax.plugin.rdbms.util.DBUtil;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.Config;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.ext.ConnHolder;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.ext.AbstractConnHolder;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.ext.ObClientConnHolder;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.ext.ServerConnectInfo;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.task.ConcurrentTableWriterTask.ConcurrentTableWriter;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.util.ObWriterUtils;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.Connection;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.SQLException;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class InsertTask implements Runnable {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(InsertTask.class);
|
||||
|
||||
private ConcurrentTableWriterTask writerTask;
|
||||
private ConcurrentTableWriter writer;
|
||||
private ConcurrentTableWriterTask writerTask;
|
||||
private ConcurrentTableWriter writer;
|
||||
|
||||
private String writeRecordSql;
|
||||
private long totalCost = 0;
|
||||
private long insertCount = 0;
|
||||
private String writeRecordSql;
|
||||
private long totalCost = 0;
|
||||
private long insertCount = 0;
|
||||
|
||||
private Queue<List<Record>> queue;
|
||||
private boolean isStop;
|
||||
private ConnHolder connHolder;
|
||||
private BlockingQueue<List<Record>> queue;
|
||||
private boolean isStop;
|
||||
private AbstractConnHolder connHolder;
|
||||
|
||||
private final long taskId;
|
||||
private ServerConnectInfo connInfo;
|
||||
private final long taskId;
|
||||
private ServerConnectInfo connInfo;
|
||||
|
||||
// 失败重试次数
|
||||
private int failTryCount = Config.DEFAULT_FAIL_TRY_COUNT;
|
||||
private boolean printCost = Config.DEFAULT_PRINT_COST;
|
||||
private long costBound = Config.DEFAULT_COST_BOUND;
|
||||
private List<Pair<String, int[]>> deleteMeta;
|
||||
// 失败重试次数
|
||||
private int failTryCount = Config.DEFAULT_FAIL_TRY_COUNT;
|
||||
private boolean printCost = Config.DEFAULT_PRINT_COST;
|
||||
private long costBound = Config.DEFAULT_COST_BOUND;
|
||||
|
||||
public InsertTask(
|
||||
final long taskId,
|
||||
Queue<List<Record>> recordsQueue,
|
||||
Configuration config,
|
||||
ServerConnectInfo connectInfo,
|
||||
String writeRecordSql,
|
||||
List<Pair<String, int[]>> deleteMeta) {
|
||||
this.taskId = taskId;
|
||||
this.queue = recordsQueue;
|
||||
this.connInfo = connectInfo;
|
||||
failTryCount = config.getInt(Config.FAIL_TRY_COUNT, Config.DEFAULT_FAIL_TRY_COUNT);
|
||||
printCost = config.getBool(Config.PRINT_COST, Config.DEFAULT_PRINT_COST);
|
||||
costBound = config.getLong(Config.COST_BOUND, Config.DEFAULT_COST_BOUND);
|
||||
this.connHolder = new ObClientConnHolder(config, connInfo.jdbcUrl,
|
||||
connInfo.getFullUserName(), connInfo.password);
|
||||
this.writeRecordSql = writeRecordSql;
|
||||
this.isStop = false;
|
||||
this.deleteMeta = deleteMeta;
|
||||
connHolder.initConnection();
|
||||
}
|
||||
|
||||
void setWriterTask(ConcurrentTableWriterTask writerTask) {
|
||||
this.writerTask = writerTask;
|
||||
}
|
||||
|
||||
void setWriter(ConcurrentTableWriter writer) {
|
||||
this.writer = writer;
|
||||
}
|
||||
public InsertTask(
|
||||
final long taskId,
|
||||
BlockingQueue<List<Record>> recordsQueue,
|
||||
Configuration config,
|
||||
ServerConnectInfo connectInfo,
|
||||
String writeRecordSql) {
|
||||
this.taskId = taskId;
|
||||
this.queue = recordsQueue;
|
||||
this.connInfo = connectInfo;
|
||||
failTryCount = config.getInt(Config.FAIL_TRY_COUNT, Config.DEFAULT_FAIL_TRY_COUNT);
|
||||
printCost = config.getBool(Config.PRINT_COST, Config.DEFAULT_PRINT_COST);
|
||||
costBound = config.getLong(Config.COST_BOUND, Config.DEFAULT_COST_BOUND);
|
||||
this.connHolder = new ObClientConnHolder(config, connInfo.jdbcUrl,
|
||||
connInfo.getFullUserName(), connInfo.password);
|
||||
this.writeRecordSql = writeRecordSql;
|
||||
this.isStop = false;
|
||||
connHolder.initConnection();
|
||||
}
|
||||
|
||||
private boolean isStop() { return isStop; }
|
||||
public void setStop() { isStop = true; }
|
||||
public long getTotalCost() { return totalCost; }
|
||||
public long getInsertCount() { return insertCount; }
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
Thread.currentThread().setName(String.format("%d-insertTask-%d", taskId, Thread.currentThread().getId()));
|
||||
LOG.debug("Task {} start to execute...", taskId);
|
||||
while (!isStop()) {
|
||||
try {
|
||||
List<Record> records = queue.poll();
|
||||
if (null != records) {
|
||||
doMultiInsert(records, this.printCost, this.costBound);
|
||||
void setWriterTask(ConcurrentTableWriterTask writerTask) {
|
||||
this.writerTask = writerTask;
|
||||
}
|
||||
|
||||
} else if (writerTask.isFinished()) {
|
||||
writerTask.singalTaskFinish();
|
||||
LOG.debug("not more task, thread exist ...");
|
||||
break;
|
||||
} else {
|
||||
TimeUnit.MILLISECONDS.sleep(5);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
LOG.debug("TableWriter is interrupt");
|
||||
} catch (Exception e) {
|
||||
LOG.warn("ERROR UNEXPECTED {}", e);
|
||||
}
|
||||
}
|
||||
LOG.debug("Thread exist...");
|
||||
}
|
||||
|
||||
public void destroy() {
|
||||
connHolder.destroy();
|
||||
};
|
||||
|
||||
public void calStatistic(final long cost) {
|
||||
writer.increFinishCount();
|
||||
++insertCount;
|
||||
totalCost += cost;
|
||||
if (this.printCost && cost > this.costBound) {
|
||||
LOG.info("slow multi insert cost {}ms", cost);
|
||||
}
|
||||
}
|
||||
void setWriter(ConcurrentTableWriter writer) {
|
||||
this.writer = writer;
|
||||
}
|
||||
|
||||
private void doDelete(Connection conn, final List<Record> buffer) throws SQLException {
|
||||
if(deleteMeta == null || deleteMeta.size() == 0) {
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < deleteMeta.size(); i++) {
|
||||
String deleteSql = deleteMeta.get(i).getKey();
|
||||
int[] valueIdx = deleteMeta.get(i).getValue();
|
||||
PreparedStatement ps = null;
|
||||
try {
|
||||
ps = conn.prepareStatement(deleteSql);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (Record record : buffer) {
|
||||
int bindIndex = 0;
|
||||
for (int idx : valueIdx) {
|
||||
writerTask.fillStatementIndex(ps, bindIndex++, idx, record.getColumn(idx));
|
||||
builder.append(record.getColumn(idx).asString()).append(",");
|
||||
}
|
||||
ps.addBatch();
|
||||
}
|
||||
LOG.debug("delete values: " + builder.toString());
|
||||
ps.executeBatch();
|
||||
} catch (SQLException ex) {
|
||||
LOG.error("SQL Exception when delete records with {}", deleteSql, ex);
|
||||
throw ex;
|
||||
} finally {
|
||||
DBUtil.closeDBResources(ps, null);
|
||||
}
|
||||
}
|
||||
}
|
||||
private boolean isStop() {
|
||||
return isStop;
|
||||
}
|
||||
|
||||
public void doMultiInsert(final List<Record> buffer, final boolean printCost, final long restrict) {
|
||||
checkMemstore();
|
||||
Connection conn = connHolder.getConn();
|
||||
boolean success = false;
|
||||
long cost = 0;
|
||||
long startTime = 0;
|
||||
try {
|
||||
for (int i = 0; i < failTryCount; ++i) {
|
||||
if (i > 0) {
|
||||
try {
|
||||
int sleep = i >= 9 ? 500 : 1 << i;//不明白为什么要sleep 500s
|
||||
TimeUnit.SECONDS.sleep(sleep);
|
||||
} catch (InterruptedException e) {
|
||||
LOG.info("thread interrupted ..., ignore");
|
||||
}
|
||||
conn = connHolder.getConn();
|
||||
LOG.info("retry {}, start do batch insert, size={}", i, buffer.size());
|
||||
checkMemstore();
|
||||
}
|
||||
startTime = System.currentTimeMillis();
|
||||
PreparedStatement ps = null;
|
||||
try {
|
||||
conn.setAutoCommit(false);
|
||||
public void setStop() {
|
||||
isStop = true;
|
||||
}
|
||||
|
||||
// do delete if necessary
|
||||
doDelete(conn, buffer);
|
||||
public long getTotalCost() {
|
||||
return totalCost;
|
||||
}
|
||||
|
||||
ps = conn.prepareStatement(writeRecordSql);
|
||||
for (Record record : buffer) {
|
||||
ps = writerTask.fillStatement(ps, record);
|
||||
ps.addBatch();
|
||||
}
|
||||
ps.executeBatch();
|
||||
conn.commit();
|
||||
success = true;
|
||||
cost = System.currentTimeMillis() - startTime;
|
||||
calStatistic(cost);
|
||||
break;
|
||||
} catch (SQLException e) {
|
||||
LOG.warn("Insert fatal error SqlState ={}, errorCode = {}, {}", e.getSQLState(), e.getErrorCode(), e);
|
||||
if (i == 0 || i > 10 ) {
|
||||
for (Record record : buffer) {
|
||||
LOG.warn("ERROR : record {}", record);
|
||||
}
|
||||
}
|
||||
// 按照错误码分类,分情况处理
|
||||
// 如果是OB系统级异常,则需要重建连接
|
||||
boolean fatalFail = ObWriterUtils.isFatalError(e);
|
||||
if (fatalFail) {
|
||||
ObWriterUtils.sleep(300000);
|
||||
connHolder.reconnect();
|
||||
// 如果是可恢复的异常,则重试
|
||||
} else if (ObWriterUtils.isRecoverableError(e)) {
|
||||
conn.rollback();
|
||||
ObWriterUtils.sleep(60000);
|
||||
} else {// 其它异常直接退出,采用逐条写入方式
|
||||
conn.rollback();
|
||||
ObWriterUtils.sleep(1000);
|
||||
break;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
LOG.warn("Insert error unexpected {}", e);
|
||||
} finally {
|
||||
DBUtil.closeDBResources(ps, null);
|
||||
}
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
LOG.warn("ERROR:retry failSql State ={}, errorCode = {}, {}", e.getSQLState(), e.getErrorCode(), e);
|
||||
}
|
||||
public long getInsertCount() {
|
||||
return insertCount;
|
||||
}
|
||||
|
||||
if (!success) {
|
||||
try {
|
||||
LOG.info("do one insert");
|
||||
conn = connHolder.reconnect();
|
||||
doOneInsert(conn, buffer);
|
||||
cost = System.currentTimeMillis() - startTime;
|
||||
calStatistic(cost);
|
||||
} finally {
|
||||
}
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public void run() {
|
||||
Thread.currentThread().setName(String.format("%d-insertTask-%d", taskId, Thread.currentThread().getId()));
|
||||
LOG.debug("Task {} start to execute...", taskId);
|
||||
while (!isStop()) {
|
||||
try {
|
||||
List<Record> records = queue.poll(5, TimeUnit.MILLISECONDS);
|
||||
if (null != records) {
|
||||
doMultiInsert(records, this.printCost, this.costBound);
|
||||
} else if (writerTask.isFinished()) {
|
||||
writerTask.singalTaskFinish();
|
||||
LOG.debug("not more task, thread exist ...");
|
||||
break;
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
LOG.debug("TableWriter is interrupt");
|
||||
} catch (Exception e) {
|
||||
LOG.warn("ERROR UNEXPECTED ", e);
|
||||
}
|
||||
}
|
||||
LOG.debug("Thread exist...");
|
||||
}
|
||||
|
||||
// process one row, delete before insert
|
||||
private void doOneInsert(Connection connection, List<Record> buffer) {
|
||||
List<PreparedStatement> deletePstmtList = new ArrayList();
|
||||
PreparedStatement preparedStatement = null;
|
||||
try {
|
||||
connection.setAutoCommit(false);
|
||||
if (deleteMeta != null && deleteMeta.size() > 0) {
|
||||
for (int i = 0; i < deleteMeta.size(); i++) {
|
||||
String deleteSql = deleteMeta.get(i).getKey();
|
||||
deletePstmtList.add(connection.prepareStatement(deleteSql));
|
||||
}
|
||||
}
|
||||
public void destroy() {
|
||||
connHolder.destroy();
|
||||
}
|
||||
|
||||
preparedStatement = connection.prepareStatement(this.writeRecordSql);
|
||||
for (Record record : buffer) {
|
||||
try {
|
||||
for (int i = 0; i < deletePstmtList.size(); i++) {
|
||||
PreparedStatement deleteStmt = deletePstmtList.get(i);
|
||||
int[] valueIdx = deleteMeta.get(i).getValue();
|
||||
int bindIndex = 0;
|
||||
for (int idx : valueIdx) {
|
||||
writerTask.fillStatementIndex(deleteStmt, bindIndex++, idx, record.getColumn(idx));
|
||||
}
|
||||
deleteStmt.execute();
|
||||
}
|
||||
preparedStatement = writerTask.fillStatement(preparedStatement, record);
|
||||
preparedStatement.execute();
|
||||
connection.commit();
|
||||
} catch (SQLException e) {
|
||||
writerTask.collectDirtyRecord(record, e);
|
||||
} finally {
|
||||
// 此处不应该关闭statement,后续的数据还需要用到
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw DataXException.asDataXException(
|
||||
DBUtilErrorCode.WRITE_DATA_ERROR, e);
|
||||
} finally {
|
||||
DBUtil.closeDBResources(preparedStatement, null);
|
||||
for (PreparedStatement pstmt : deletePstmtList) {
|
||||
DBUtil.closeDBResources(pstmt, null);
|
||||
}
|
||||
}
|
||||
}
|
||||
public void calStatistic(final long cost) {
|
||||
writer.increFinishCount();
|
||||
++insertCount;
|
||||
totalCost += cost;
|
||||
if (this.printCost && cost > this.costBound) {
|
||||
LOG.info("slow multi insert cost {}ms", cost);
|
||||
}
|
||||
}
|
||||
|
||||
private void checkMemstore() {
|
||||
while (writerTask.isMemStoreFull()) {
|
||||
ObWriterUtils.sleep(30000);
|
||||
}
|
||||
}
|
||||
public void doMultiInsert(final List<Record> buffer, final boolean printCost, final long restrict) {
|
||||
checkMemstore();
|
||||
Connection conn = connHolder.getConn();
|
||||
boolean success = false;
|
||||
long cost = 0;
|
||||
long startTime = 0;
|
||||
try {
|
||||
for (int i = 0; i < failTryCount; ++i) {
|
||||
if (i > 0) {
|
||||
conn = connHolder.getConn();
|
||||
LOG.info("retry {}, start do batch insert, size={}", i, buffer.size());
|
||||
checkMemstore();
|
||||
}
|
||||
startTime = System.currentTimeMillis();
|
||||
PreparedStatement ps = null;
|
||||
try {
|
||||
conn.setAutoCommit(false);
|
||||
ps = conn.prepareStatement(writeRecordSql);
|
||||
for (Record record : buffer) {
|
||||
ps = writerTask.fillStatement(ps, record);
|
||||
ps.addBatch();
|
||||
}
|
||||
ps.executeBatch();
|
||||
conn.commit();
|
||||
success = true;
|
||||
cost = System.currentTimeMillis() - startTime;
|
||||
calStatistic(cost);
|
||||
break;
|
||||
} catch (SQLException e) {
|
||||
LOG.warn("Insert fatal error SqlState ={}, errorCode = {}, {}", e.getSQLState(), e.getErrorCode(), e);
|
||||
if (LOG.isDebugEnabled() && (i == 0 || i > 10)) {
|
||||
for (Record record : buffer) {
|
||||
LOG.warn("ERROR : record {}", record);
|
||||
}
|
||||
}
|
||||
// 按照错误码分类,分情况处理
|
||||
// 如果是OB系统级异常,则需要重建连接
|
||||
boolean fatalFail = ObWriterUtils.isFatalError(e);
|
||||
if (fatalFail) {
|
||||
ObWriterUtils.sleep(300000);
|
||||
connHolder.reconnect();
|
||||
// 如果是可恢复的异常,则重试
|
||||
} else if (ObWriterUtils.isRecoverableError(e)) {
|
||||
conn.rollback();
|
||||
ObWriterUtils.sleep(60000);
|
||||
} else {// 其它异常直接退出,采用逐条写入方式
|
||||
conn.rollback();
|
||||
ObWriterUtils.sleep(1000);
|
||||
break;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
LOG.warn("Insert error unexpected {}", e);
|
||||
} finally {
|
||||
DBUtil.closeDBResources(ps, null);
|
||||
}
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
LOG.warn("ERROR:retry failSql State ={}, errorCode = {}, {}", e.getSQLState(), e.getErrorCode(), e);
|
||||
}
|
||||
|
||||
if (!success) {
|
||||
LOG.info("do one insert");
|
||||
conn = connHolder.reconnect();
|
||||
writerTask.doOneInsert(conn, buffer);
|
||||
cost = System.currentTimeMillis() - startTime;
|
||||
calStatistic(cost);
|
||||
}
|
||||
}
|
||||
|
||||
private void checkMemstore() {
|
||||
if (writerTask.isShouldSlow()) {
|
||||
ObWriterUtils.sleep(100);
|
||||
} else {
|
||||
while (writerTask.isShouldPause()) {
|
||||
ObWriterUtils.sleep(100);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -12,7 +12,7 @@ import com.alibaba.datax.plugin.rdbms.util.DataBaseType;
|
||||
import com.alibaba.datax.plugin.rdbms.writer.CommonRdbmsWriter;
|
||||
import com.alibaba.datax.plugin.rdbms.writer.Key;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.Config;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.ext.ConnHolder;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.ext.AbstractConnHolder;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.ext.ObClientConnHolder;
|
||||
import com.alibaba.datax.plugin.writer.oceanbasev10writer.util.ObWriterUtils;
|
||||
|
||||
@ -30,7 +30,7 @@ public class SingleTableWriterTask extends CommonRdbmsWriter.Task {
|
||||
// 失败重试次数
|
||||
private int failTryCount = Config.DEFAULT_FAIL_TRY_COUNT;
|
||||
|
||||
private ConnHolder connHolder;
|
||||
private AbstractConnHolder connHolder;
|
||||
private String obWriteMode = "update";
|
||||
private boolean isOracleCompatibleMode = false;
|
||||
private String obUpdateColumns = null;
|
||||
|
@ -66,4 +66,48 @@ public class DbUtils {
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
/**
|
||||
* build sys connection from ordinary jdbc url
|
||||
*
|
||||
* @param jdbcUrl
|
||||
* @param clusterName
|
||||
* @return
|
||||
* @throws Exception
|
||||
*/
|
||||
public static Connection buildSysConn(String jdbcUrl, String clusterName) throws Exception {
|
||||
jdbcUrl = jdbcUrl.replace("jdbc:mysql://", "jdbc:oceanbase://");
|
||||
int startIdx = jdbcUrl.indexOf('/', "jdbc:oceanbase://".length());
|
||||
int endIdx = jdbcUrl.lastIndexOf('?');
|
||||
String prefix = jdbcUrl.substring(0, startIdx + 1);
|
||||
final String postfix = jdbcUrl.substring(endIdx);
|
||||
String sysJDBCUrl = prefix + "oceanbase" + postfix;
|
||||
|
||||
String tenantName = "sys";
|
||||
String[][] userConfigs = {
|
||||
{"monitor", "monitor"}
|
||||
};
|
||||
|
||||
Connection conn = null;
|
||||
for (String[] userConfig : userConfigs) {
|
||||
try {
|
||||
conn = DBUtil.getConnectionWithoutRetry(DataBaseType.OceanBase, sysJDBCUrl, String.format("%s@%s#%s", userConfig[0],
|
||||
tenantName, clusterName), userConfig[1]);
|
||||
} catch (Exception e) {
|
||||
LOG.warn("fail connecting to ob: " + e.getMessage());
|
||||
|
||||
}
|
||||
if (conn == null) {
|
||||
LOG.warn("fail to get connection with user " + userConfig[0] + ", try alternative user.");
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (conn == null) {
|
||||
throw new Exception("fail to get connection with sys tenant.");
|
||||
}
|
||||
|
||||
return conn;
|
||||
}
|
||||
}
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user