Merge remote-tracking branch 'origin/master' into mergeUpstream

# Conflicts:
#	tdenginereader/pom.xml
#	tdenginewriter/pom.xml
#	tdenginewriter/src/main/java/com/alibaba/datax/plugin/writer/tdenginewriter/DefaultDataHandler.java
dingxiaobo 2022-10-26 14:55:46 +08:00
commit 8fadb0c11e
176 changed files with 8633 additions and 2514 deletions


@@ -25,7 +25,7 @@ DataX itself, as a data synchronization framework, abstracts synchronization between different data sources into reading from a source
# Quick Start
-##### Download [DataX download link](https://datax-opensource.oss-cn-hangzhou.aliyuncs.com/20220530/datax.tar.gz)
+##### Download [DataX download link](https://datax-opensource.oss-cn-hangzhou.aliyuncs.com/202209/datax.tar.gz)
##### Please see: [Quick Start](https://github.com/alibaba/DataX/blob/master/userGuid.md)
@@ -44,6 +44,8 @@ DataX already has a fairly comprehensive plugin ecosystem: mainstream RDBMS databases, N
| | SQLServer | √ | √ |[read](https://github.com/alibaba/DataX/blob/master/sqlserverreader/doc/sqlserverreader.md), [write](https://github.com/alibaba/DataX/blob/master/sqlserverwriter/doc/sqlserverwriter.md)|
| | PostgreSQL | √ | √ |[read](https://github.com/alibaba/DataX/blob/master/postgresqlreader/doc/postgresqlreader.md), [write](https://github.com/alibaba/DataX/blob/master/postgresqlwriter/doc/postgresqlwriter.md)|
| | DRDS | √ | √ |[read](https://github.com/alibaba/DataX/blob/master/drdsreader/doc/drdsreader.md), [write](https://github.com/alibaba/DataX/blob/master/drdswriter/doc/drdswriter.md)|
+| | Apache Doris | | √ |[write](https://github.com/alibaba/DataX/blob/master/doriswriter/doc/doriswriter.md)|
+| | StarRocks | | √ |[write](https://github.com/alibaba/DataX/blob/master/starrockswriter/doc/starrockswriter.md)|
| | Generic RDBMS (supports all relational databases) | √ | √ |[read](https://github.com/alibaba/DataX/blob/master/rdbmsreader/doc/rdbmsreader.md), [write](https://github.com/alibaba/DataX/blob/master/rdbmswriter/doc/rdbmswriter.md)|
| Alibaba Cloud data warehouse storage | ODPS | √ | √ |[read](https://github.com/alibaba/DataX/blob/master/odpsreader/doc/odpsreader.md), [write](https://github.com/alibaba/DataX/blob/master/odpswriter/doc/odpswriter.md)|
| | ADS | | √ |[write](https://github.com/alibaba/DataX/blob/master/adswriter/doc/adswriter.md)|
@@ -95,6 +97,9 @@ DataX already has a fairly comprehensive plugin ecosystem: mainstream RDBMS databases, N
DataX plans to continue with monthly iterative releases, and interested contributors are welcome to submit pull requests. The contents of each monthly release are summarized below.
+- [datax_v202209](https://github.com/alibaba/DataX/releases/tag/datax_v202209)
+  - Channel capability updates (MaxCompute, Datahub, SLS, etc.), security vulnerability fixes, and general packaging updates
- [datax_v202205](https://github.com/alibaba/DataX/releases/tag/datax_v202205)
  - Channel capability updates (MaxCompute, Hologres, OSS, Tdengine, etc.), security vulnerability fixes, and general packaging updates


@@ -70,7 +70,7 @@ public class DataType {
} else if ("datetime".equals(type)) {
return DATETIME;
} else {
-throw new IllegalArgumentException("unkown type: " + type);
+throw new IllegalArgumentException("unknown type: " + type);
}
}


@@ -68,7 +68,7 @@ public class ClickhouseWriter extends Writer {
this.commonRdbmsWriterSlave = new CommonRdbmsWriter.Task(DATABASE_TYPE) {
@Override
-protected PreparedStatement fillPreparedStatementColumnType(PreparedStatement preparedStatement, int columnIndex, int columnSqltype, Column column) throws SQLException {
+protected PreparedStatement fillPreparedStatementColumnType(PreparedStatement preparedStatement, int columnIndex, int columnSqltype, String typeName, Column column) throws SQLException {
try {
if (column.getRawData() == null) {
preparedStatement.setNull(columnIndex + 1, columnSqltype);


@@ -2,5 +2,5 @@
"name": "clickhousewriter",
"class": "com.alibaba.datax.plugin.writer.clickhousewriter.ClickhouseWriter",
"description": "useScene: prod. mechanism: Jdbc connection using the database, execute insert sql.",
-"developer": "jiye.tjy"
+"developer": "alibaba"
}


@@ -411,6 +411,15 @@ public class Configuration {
return list;
}
+public <T> List<T> getListWithJson(final String path, Class<T> t) {
+Object object = this.get(path, List.class);
+if (null == object) {
+return null;
+}
+return JSON.parseArray(JSON.toJSONString(object), t);
+}
/**
 * Locates a List object by the user-provided JSON path; returns null if the object does not exist.
 */

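As a side note on the new `getListWithJson` helper added above, here is a minimal usage sketch; the `connection` path and the `Endpoint` POJO are hypothetical, chosen only for illustration:

```java
import com.alibaba.datax.common.util.Configuration;

import java.util.List;

public class GetListWithJsonDemo {

    // Hypothetical POJO used only for this illustration.
    public static class Endpoint {
        private String jdbcUrl;
        public String getJdbcUrl() { return jdbcUrl; }
        public void setJdbcUrl(String jdbcUrl) { this.jdbcUrl = jdbcUrl; }
    }

    public static void main(String[] args) {
        // Configuration.from(String) builds a Configuration from raw JSON (datax-common API).
        Configuration conf = Configuration.from(
                "{\"connection\":[{\"jdbcUrl\":\"jdbc:mysql://127.0.0.1:3306/db\"}]}");
        // The new helper re-serializes the raw list and parses it into typed objects via fastjson.
        List<Endpoint> endpoints = conf.getListWithJson("connection", Endpoint.class);
        System.out.println(endpoints.get(0).getJdbcUrl());
    }
}
```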

@@ -3,6 +3,8 @@ package com.alibaba.datax.common.util;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
import java.text.DecimalFormat;
import java.util.HashMap;
import java.util.Map;
@@ -82,4 +84,20 @@ public class StrUtil {
return s.substring(0, headLength) + "..." + s.substring(s.length() - tailLength);
}
+public static String getMd5(String plainText) {
+try {
+StringBuilder builder = new StringBuilder();
+for (byte b : MessageDigest.getInstance("MD5").digest(plainText.getBytes())) {
+int i = b & 0xff;
+if (i < 0x10) {
+builder.append('0');
+}
+builder.append(Integer.toHexString(i));
+}
+return builder.toString();
+} catch (NoSuchAlgorithmException e) {
+throw new RuntimeException(e);
+}
+}
}

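A small sketch of the new `StrUtil.getMd5` helper in use; the cross-check against commons-codec's `DigestUtils.md5Hex` is an assumption that commons-codec is available on the classpath, as it is elsewhere in the project:

```java
import com.alibaba.datax.common.util.StrUtil;

import org.apache.commons.codec.digest.DigestUtils;

public class Md5Demo {
    public static void main(String[] args) {
        String text = "DataX";
        // New helper added in this commit: 32-character lowercase hex MD5.
        String md5 = StrUtil.getMd5(text);
        // Cross-check against commons-codec; both lines should print the same digest for ASCII input.
        System.out.println(md5);
        System.out.println(DigestUtils.md5Hex(text));
    }
}
```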

@@ -41,7 +41,7 @@
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
-<version>4.5</version>
+<version>4.5.13</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>


@ -0,0 +1,87 @@
package com.alibaba.datax.core.transport.transformer;
import com.alibaba.datax.common.element.Column;
import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.element.StringColumn;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.transformer.Transformer;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang.StringUtils;
import java.util.Arrays;
/**
* no comments.
*
* @author XuDaojie
* @since 2021-08-16
*/
public class DigestTransformer extends Transformer {
private static final String MD5 = "md5";
private static final String SHA1 = "sha1";
private static final String TO_UPPER_CASE = "toUpperCase";
private static final String TO_LOWER_CASE = "toLowerCase";
public DigestTransformer() {
setTransformerName("dx_digest");
}
@Override
public Record evaluate(Record record, Object... paras) {
int columnIndex;
String type;
String charType;
try {
if (paras.length != 3) {
throw new RuntimeException("dx_digest paras length must be 3");
}
columnIndex = (Integer) paras[0];
type = (String) paras[1];
charType = (String) paras[2];
if (!StringUtils.equalsIgnoreCase(MD5, type) && !StringUtils.equalsIgnoreCase(SHA1, type)) {
throw new RuntimeException("dx_digest paras index 1 must be md5 or sha1");
}
if (!StringUtils.equalsIgnoreCase(TO_UPPER_CASE, charType) && !StringUtils.equalsIgnoreCase(TO_LOWER_CASE, charType)) {
throw new RuntimeException("dx_digest paras index 2 must be toUpperCase or toLowerCase");
}
} catch (Exception e) {
throw DataXException.asDataXException(TransformerErrorCode.TRANSFORMER_ILLEGAL_PARAMETER, "paras:" + Arrays.asList(paras) + " => " + e.getMessage());
}
Column column = record.getColumn(columnIndex);
try {
String oriValue = column.asString();
// If the field is null, treat it as an empty string
if (oriValue == null) {
oriValue = "";
}
String newValue;
if (MD5.equals(type)) {
newValue = DigestUtils.md5Hex(oriValue);
} else {
newValue = DigestUtils.sha1Hex(oriValue);
}
if (TO_UPPER_CASE.equals(charType)) {
newValue = newValue.toUpperCase();
} else {
newValue = newValue.toLowerCase();
}
record.setColumn(columnIndex, new StringColumn(newValue));
} catch (Exception e) {
throw DataXException.asDataXException(TransformerErrorCode.TRANSFORMER_RUN_EXCEPTION, e.getMessage(), e);
}
return record;
}
}

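To show how the new `dx_digest` transformer behaves, here is a hedged sketch that calls `evaluate()` directly on an in-memory record; it assumes the core's `DefaultRecord` implementation is available, and the parameter order (column index, digest type, case) follows the validation in the code above:

```java
import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.element.StringColumn;
import com.alibaba.datax.core.transport.record.DefaultRecord;
import com.alibaba.datax.core.transport.transformer.DigestTransformer;

public class DigestTransformerDemo {
    public static void main(String[] args) {
        Record record = new DefaultRecord();          // assumes the core's default Record implementation
        record.addColumn(new StringColumn("hello"));  // column 0

        // paras: [columnIndex, "md5" | "sha1", "toUpperCase" | "toLowerCase"]
        Record result = new DigestTransformer().evaluate(record, 0, "md5", "toUpperCase");
        System.out.println(result.getColumn(0).asString()); // uppercase MD5 digest of "hello"
    }
}
```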

@@ -61,7 +61,7 @@ public class FilterTransformer extends Transformer {
} else if (code.equalsIgnoreCase("<=")) {
return doLess(record, value, column, true);
} else {
-throw new RuntimeException("dx_filter can't suport code:" + code);
+throw new RuntimeException("dx_filter can't support code:" + code);
}
} catch (Exception e) {
throw DataXException.asDataXException(TransformerErrorCode.TRANSFORMER_RUN_EXCEPTION, e.getMessage(), e);


@@ -1,10 +1,18 @@
package com.alibaba.datax.core.transport.transformer;
+import org.apache.commons.codec.digest.DigestUtils;
/**
 * Helper class for GroovyTransformer, for use from groovy code; all methods must be static.
 * Created by liqiang on 16/3/4.
 */
public class GroovyTransformerStaticUtil {
+public static String md5(final String data) {
+return DigestUtils.md5Hex(data);
+}
+public static String sha1(final String data) {
+return DigestUtils.sha1Hex(data);
+}
}

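The two static helpers added above are intended to be called from user-supplied `dx_groovy` code; calling them directly from Java is equivalent, as in this trivial sketch:

```java
import com.alibaba.datax.core.transport.transformer.GroovyTransformerStaticUtil;

public class GroovyStaticUtilDemo {
    public static void main(String[] args) {
        // Both helpers delegate to commons-codec and return lowercase hex digests.
        System.out.println(GroovyTransformerStaticUtil.md5("DataX"));
        System.out.println(GroovyTransformerStaticUtil.sha1("DataX"));
    }
}
```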

@@ -36,6 +36,7 @@ public class TransformerRegistry {
registTransformer(new ReplaceTransformer());
registTransformer(new FilterTransformer());
registTransformer(new GroovyTransformer());
+registTransformer(new DigestTransformer());
}
public static void loadTransformerFromLocalStorage() {


@@ -2,7 +2,7 @@
"job": {
"setting": {
"speed": {
-"byte":10485760
+"channel":1
},
"errorLimit": {
"record": 0,

datahubreader/pom.xml (new file)

@ -0,0 +1,79 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>datax-all</artifactId>
<groupId>com.alibaba.datax</groupId>
<version>0.0.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>datahubreader</artifactId>
<version>0.0.1-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-common</artifactId>
<version>${datax-project-version}</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
</dependency>
<dependency>
<groupId>com.aliyun.datahub</groupId>
<artifactId>aliyun-sdk-datahub</artifactId>
<version>2.21.6-public</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- compiler plugin -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${jdk-version}</source>
<target>${jdk-version}</target>
<encoding>${project-sourceEncoding}</encoding>
</configuration>
</plugin>
<!-- assembly plugin -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptors>
<descriptor>src/main/assembly/package.xml</descriptor>
</descriptors>
<finalName>datax</finalName>
</configuration>
<executions>
<execution>
<id>dwzip</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>


@ -0,0 +1,34 @@
<assembly
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
<id></id>
<formats>
<format>dir</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<fileSets>
<fileSet>
<directory>src/main/resources</directory>
<includes>
<include>plugin.json</include>
</includes>
<outputDirectory>plugin/reader/datahubreader</outputDirectory>
</fileSet>
<fileSet>
<directory>target/</directory>
<includes>
<include>datahubreader-0.0.1-SNAPSHOT.jar</include>
</includes>
<outputDirectory>plugin/reader/datahubreader</outputDirectory>
</fileSet>
</fileSets>
<dependencySets>
<dependencySet>
<useProjectArtifact>false</useProjectArtifact>
<outputDirectory>plugin/reader/datahubreader/libs</outputDirectory>
<scope>runtime</scope>
</dependencySet>
</dependencySets>
</assembly>


@ -0,0 +1,8 @@
package com.alibaba.datax.plugin.reader.datahubreader;
public class Constant {
public static String DATETIME_FORMAT = "yyyyMMddHHmmss";
public static String DATE_FORMAT = "yyyyMMdd";
}


@ -0,0 +1,42 @@
package com.alibaba.datax.plugin.reader.datahubreader;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.TypeReference;
import com.aliyun.datahub.client.DatahubClient;
import com.aliyun.datahub.client.DatahubClientBuilder;
import com.aliyun.datahub.client.auth.Account;
import com.aliyun.datahub.client.auth.AliyunAccount;
import com.aliyun.datahub.client.common.DatahubConfig;
import com.aliyun.datahub.client.http.HttpConfig;
import org.apache.commons.lang3.StringUtils;
public class DatahubClientHelper {
public static DatahubClient getDatahubClient(Configuration jobConfig) {
String accessId = jobConfig.getNecessaryValue(Key.CONFIG_KEY_ACCESS_ID,
DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
String accessKey = jobConfig.getNecessaryValue(Key.CONFIG_KEY_ACCESS_KEY,
DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
String endpoint = jobConfig.getNecessaryValue(Key.CONFIG_KEY_ENDPOINT,
DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
Account account = new AliyunAccount(accessId, accessKey);
// Whether to enable binary transfer; supported by the server since version 2.12
boolean enableBinary = jobConfig.getBool("enableBinary", false);
DatahubConfig datahubConfig = new DatahubConfig(endpoint, account, enableBinary);
// HttpConfig is optional; defaults are used when it is not set
// For reading and writing data, enabling LZ4 compression for network transfer is recommended
HttpConfig httpConfig = null;
String httpConfigStr = jobConfig.getString("httpConfig");
if (StringUtils.isNotBlank(httpConfigStr)) {
httpConfig = JSON.parseObject(httpConfigStr, new TypeReference<HttpConfig>() {
});
}
DatahubClientBuilder builder = DatahubClientBuilder.newBuilder().setDatahubConfig(datahubConfig);
if (null != httpConfig) {
builder.setHttpConfig(httpConfig);
}
DatahubClient datahubClient = builder.build();
return datahubClient;
}
}

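For orientation, a hedged sketch of how `DatahubClientHelper.getDatahubClient` might be driven from a job configuration; the endpoint and credentials are placeholders, and building the client does not necessarily validate them:

```java
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.plugin.reader.datahubreader.DatahubClientHelper;
import com.aliyun.datahub.client.DatahubClient;

public class DatahubClientHelperDemo {
    public static void main(String[] args) {
        // Placeholder endpoint and credentials; replace with real values before running.
        Configuration jobConfig = Configuration.from(
                "{\"endpoint\":\"https://dh-cn-hangzhou.aliyuncs.com\","
                + "\"accessId\":\"<accessId>\","
                + "\"accessKey\":\"<accessKey>\","
                + "\"enableBinary\":false}");
        // The helper checks the three required keys and builds the client;
        // an optional "httpConfig" JSON string would be parsed into the SDK's HttpConfig.
        DatahubClient client = DatahubClientHelper.getDatahubClient(jobConfig);
        System.out.println("client created: " + (client != null));
    }
}
```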

@ -0,0 +1,292 @@
package com.alibaba.datax.plugin.reader.datahubreader;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import com.aliyun.datahub.client.model.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.datax.common.element.Column;
import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.element.StringColumn;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordSender;
import com.alibaba.datax.common.spi.Reader;
import com.alibaba.datax.common.util.Configuration;
import com.aliyun.datahub.client.DatahubClient;
public class DatahubReader extends Reader {
public static class Job extends Reader.Job {
private static final Logger LOG = LoggerFactory.getLogger(Job.class);
private Configuration originalConfig;
private Long beginTimestampMillis;
private Long endTimestampMillis;
DatahubClient datahubClient;
@Override
public void init() {
LOG.info("datahub reader job init begin ...");
this.originalConfig = super.getPluginJobConf();
validateParameter(originalConfig);
this.datahubClient = DatahubClientHelper.getDatahubClient(this.originalConfig);
LOG.info("datahub reader job init end.");
}
private void validateParameter(Configuration conf){
conf.getNecessaryValue(Key.ENDPOINT,DatahubReaderErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.ACCESSKEYID,DatahubReaderErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.ACCESSKEYSECRET,DatahubReaderErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.PROJECT,DatahubReaderErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.TOPIC,DatahubReaderErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.COLUMN,DatahubReaderErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.BEGINDATETIME,DatahubReaderErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.ENDDATETIME,DatahubReaderErrorCode.REQUIRE_VALUE);
int batchSize = this.originalConfig.getInt(Key.BATCHSIZE, 1024);
if (batchSize > 10000) {
throw DataXException.asDataXException(DatahubReaderErrorCode.BAD_CONFIG_VALUE,
"Invalid batchSize[" + batchSize + "] value (0,10000]!");
}
String beginDateTime = this.originalConfig.getString(Key.BEGINDATETIME);
if (beginDateTime != null) {
try {
beginTimestampMillis = DatahubReaderUtils.getUnixTimeFromDateTime(beginDateTime);
} catch (ParseException e) {
throw DataXException.asDataXException(DatahubReaderErrorCode.BAD_CONFIG_VALUE,
"Invalid beginDateTime[" + beginDateTime + "], format [yyyyMMddHHmmss]!");
}
}
if (beginTimestampMillis != null && beginTimestampMillis <= 0) {
throw DataXException.asDataXException(DatahubReaderErrorCode.BAD_CONFIG_VALUE,
"Invalid beginTimestampMillis[" + beginTimestampMillis + "]!");
}
String endDateTime = this.originalConfig.getString(Key.ENDDATETIME);
if (endDateTime != null) {
try {
endTimestampMillis = DatahubReaderUtils.getUnixTimeFromDateTime(endDateTime);
} catch (ParseException e) {
throw DataXException.asDataXException(DatahubReaderErrorCode.BAD_CONFIG_VALUE,
"Invalid endDateTime[" + endDateTime + "], format [yyyyMMddHHmmss]!");
}
}
if (endTimestampMillis != null && endTimestampMillis <= 0) {
throw DataXException.asDataXException(DatahubReaderErrorCode.BAD_CONFIG_VALUE,
"Invalid endTimestampMillis[" + endTimestampMillis + "]!");
}
if (beginTimestampMillis != null && endTimestampMillis != null
&& endTimestampMillis <= beginTimestampMillis) {
throw DataXException.asDataXException(DatahubReaderErrorCode.BAD_CONFIG_VALUE,
"endTimestampMillis[" + endTimestampMillis + "] must bigger than beginTimestampMillis[" + beginTimestampMillis + "]!");
}
}
@Override
public void prepare() {
// create datahub client
String project = originalConfig.getNecessaryValue(Key.PROJECT, DatahubReaderErrorCode.REQUIRE_VALUE);
String topic = originalConfig.getNecessaryValue(Key.TOPIC, DatahubReaderErrorCode.REQUIRE_VALUE);
RecordType recordType = null;
try {
DatahubClient client = DatahubClientHelper.getDatahubClient(this.originalConfig);
GetTopicResult getTopicResult = client.getTopic(project, topic);
recordType = getTopicResult.getRecordType();
} catch (Exception e) {
LOG.warn("get topic type error: {}", e.getMessage());
}
if (null != recordType) {
if (recordType == RecordType.BLOB) {
throw DataXException.asDataXException(DatahubReaderErrorCode.BAD_CONFIG_VALUE,
"DatahubReader only support 'Tuple' RecordType now, but your RecordType is 'BLOB'");
}
}
}
@Override
public void destroy() {
}
@Override
public List<Configuration> split(int adviceNumber) {
LOG.info("split() begin...");
List<Configuration> readerSplitConfigs = new ArrayList<Configuration>();
String project = this.originalConfig.getString(Key.PROJECT);
String topic = this.originalConfig.getString(Key.TOPIC);
List<ShardEntry> shardEntrys = DatahubReaderUtils.getShardsWithRetry(this.datahubClient, project, topic);
if (shardEntrys == null || shardEntrys.isEmpty() || shardEntrys.size() == 0) {
throw DataXException.asDataXException(DatahubReaderErrorCode.BAD_CONFIG_VALUE,
"Project [" + project + "] Topic [" + topic + "] has no shards, please check !");
}
for (ShardEntry shardEntry : shardEntrys) {
Configuration splitedConfig = this.originalConfig.clone();
splitedConfig.set(Key.SHARDID, shardEntry.getShardId());
readerSplitConfigs.add(splitedConfig);
}
LOG.info("split() ok and end...");
return readerSplitConfigs;
}
}
public static class Task extends Reader.Task {
private static final Logger LOG = LoggerFactory.getLogger(Task.class);
private Configuration taskConfig;
private String accessId;
private String accessKey;
private String endpoint;
private String project;
private String topic;
private String shardId;
private Long beginTimestampMillis;
private Long endTimestampMillis;
private int batchSize;
private List<String> columns;
private RecordSchema schema;
private String timeStampUnit;
DatahubClient datahubClient;
@Override
public void init() {
this.taskConfig = super.getPluginJobConf();
this.accessId = this.taskConfig.getString(Key.ACCESSKEYID);
this.accessKey = this.taskConfig.getString(Key.ACCESSKEYSECRET);
this.endpoint = this.taskConfig.getString(Key.ENDPOINT);
this.project = this.taskConfig.getString(Key.PROJECT);
this.topic = this.taskConfig.getString(Key.TOPIC);
this.shardId = this.taskConfig.getString(Key.SHARDID);
this.batchSize = this.taskConfig.getInt(Key.BATCHSIZE, 1024);
this.timeStampUnit = this.taskConfig.getString(Key.TIMESTAMP_UNIT, "MICROSECOND");
try {
this.beginTimestampMillis = DatahubReaderUtils.getUnixTimeFromDateTime(this.taskConfig.getString(Key.BEGINDATETIME));
} catch (ParseException e) {
}
try {
this.endTimestampMillis = DatahubReaderUtils.getUnixTimeFromDateTime(this.taskConfig.getString(Key.ENDDATETIME));
} catch (ParseException e) {
}
this.columns = this.taskConfig.getList(Key.COLUMN, String.class);
this.datahubClient = DatahubClientHelper.getDatahubClient(this.taskConfig);
this.schema = DatahubReaderUtils.getDatahubSchemaWithRetry(this.datahubClient, this.project, topic);
LOG.info("init datahub reader task finished.project:{} topic:{} batchSize:{}", project, topic, batchSize);
}
@Override
public void destroy() {
}
@Override
public void startRead(RecordSender recordSender) {
LOG.info("read start");
String beginCursor = DatahubReaderUtils.getCursorWithRetry(this.datahubClient, this.project,
this.topic, this.shardId, this.beginTimestampMillis);
String endCursor = DatahubReaderUtils.getCursorWithRetry(this.datahubClient, this.project,
this.topic, this.shardId, this.endTimestampMillis);
if (beginCursor == null) {
LOG.info("Shard:{} has no data!", this.shardId);
return;
} else if (endCursor == null) {
endCursor = DatahubReaderUtils.getLatestCursorWithRetry(this.datahubClient, this.project,
this.topic, this.shardId);
}
String curCursor = beginCursor;
boolean exit = false;
while (true) {
GetRecordsResult result = DatahubReaderUtils.getRecordsResultWithRetry(this.datahubClient, this.project, this.topic,
this.shardId, this.batchSize, curCursor, this.schema);
List<RecordEntry> records = result.getRecords();
if (records.size() > 0) {
for (RecordEntry record : records) {
if (record.getSystemTime() >= this.endTimestampMillis) {
exit = true;
break;
}
HashMap<String, Column> dataMap = new HashMap<String, Column>();
List<Field> fields = ((TupleRecordData) record.getRecordData()).getRecordSchema().getFields();
for (int i = 0; i < fields.size(); i++) {
Field field = fields.get(i);
Column column = DatahubReaderUtils.getColumnFromField(record, field, this.timeStampUnit);
dataMap.put(field.getName(), column);
}
Record dataxRecord = recordSender.createRecord();
if (null != this.columns && 1 == this.columns.size()) {
String columnsInStr = columns.get(0).toString();
if ("\"*\"".equals(columnsInStr) || "*".equals(columnsInStr)) {
for (int i = 0; i < fields.size(); i++) {
dataxRecord.addColumn(dataMap.get(fields.get(i).getName()));
}
} else {
if (dataMap.containsKey(columnsInStr)) {
dataxRecord.addColumn(dataMap.get(columnsInStr));
} else {
dataxRecord.addColumn(new StringColumn(null));
}
}
} else {
for (String col : this.columns) {
if (dataMap.containsKey(col)) {
dataxRecord.addColumn(dataMap.get(col));
} else {
dataxRecord.addColumn(new StringColumn(null));
}
}
}
recordSender.sendToWriter(dataxRecord);
}
} else {
break;
}
if (exit) {
break;
}
curCursor = result.getNextCursor();
}
LOG.info("end read datahub shard...");
}
}
}


@ -0,0 +1,35 @@
package com.alibaba.datax.plugin.reader.datahubreader;
import com.alibaba.datax.common.spi.ErrorCode;
public enum DatahubReaderErrorCode implements ErrorCode {
BAD_CONFIG_VALUE("DatahubReader-00", "The value you configured is invalid."),
LOG_HUB_ERROR("DatahubReader-01","Datahub exception"),
REQUIRE_VALUE("DatahubReader-02","Missing parameters"),
EMPTY_LOGSTORE_VALUE("DatahubReader-03","There is no shard under this LogStore");
private final String code;
private final String description;
private DatahubReaderErrorCode(String code, String description) {
this.code = code;
this.description = description;
}
@Override
public String getCode() {
return this.code;
}
@Override
public String getDescription() {
return this.description;
}
@Override
public String toString() {
return String.format("Code:[%s], Description:[%s]. ", this.code,
this.description);
}
}


@ -0,0 +1,200 @@
package com.alibaba.datax.plugin.reader.datahubreader;
import java.math.BigDecimal;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.concurrent.Callable;
import com.alibaba.datax.common.element.*;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.util.DataXCaseEnvUtil;
import com.alibaba.datax.common.util.RetryUtil;
import com.aliyun.datahub.client.DatahubClient;
import com.aliyun.datahub.client.exception.InvalidParameterException;
import com.aliyun.datahub.client.model.*;
public class DatahubReaderUtils {
public static long getUnixTimeFromDateTime(String dateTime) throws ParseException {
try {
String format = Constant.DATETIME_FORMAT;
SimpleDateFormat simpleDateFormat = new SimpleDateFormat(format);
return simpleDateFormat.parse(dateTime).getTime();
} catch (ParseException ignored) {
throw DataXException.asDataXException(DatahubReaderErrorCode.BAD_CONFIG_VALUE,
"Invalid DateTime[" + dateTime + "]!");
}
}
public static List<ShardEntry> getShardsWithRetry(final DatahubClient datahubClient, final String project, final String topic) {
List<ShardEntry> shards = null;
try {
shards = RetryUtil.executeWithRetry(new Callable<List<ShardEntry>>() {
@Override
public List<ShardEntry> call() throws Exception {
ListShardResult listShardResult = datahubClient.listShard(project, topic);
return listShardResult.getShards();
}
}, DataXCaseEnvUtil.getRetryTimes(7), DataXCaseEnvUtil.getRetryInterval(1000L), DataXCaseEnvUtil.getRetryExponential(true));
} catch (Exception e) {
throw DataXException.asDataXException(DatahubReaderErrorCode.BAD_CONFIG_VALUE,
"get Shards error, please check! detail error message: " + e.toString());
}
return shards;
}
public static String getCursorWithRetry(final DatahubClient datahubClient, final String project, final String topic,
final String shardId, final long timestamp) {
String cursor;
try {
cursor = RetryUtil.executeWithRetry(new Callable<String>() {
@Override
public String call() throws Exception {
try {
return datahubClient.getCursor(project, topic, shardId, CursorType.SYSTEM_TIME, timestamp).getCursor();
} catch (InvalidParameterException e) {
if (e.getErrorMessage().indexOf("Time in seek request is out of range") >= 0) {
return null;
} else {
throw e;
}
}
}
}, DataXCaseEnvUtil.getRetryTimes(7), DataXCaseEnvUtil.getRetryInterval(1000L), DataXCaseEnvUtil.getRetryExponential(true));
} catch (Exception e) {
throw DataXException.asDataXException(DatahubReaderErrorCode.BAD_CONFIG_VALUE,
"get Cursor error, please check! detail error message: " + e.toString());
}
return cursor;
}
public static String getLatestCursorWithRetry(final DatahubClient datahubClient, final String project, final String topic,
final String shardId) {
String cursor;
try {
cursor = RetryUtil.executeWithRetry(new Callable<String>() {
@Override
public String call() throws Exception {
return datahubClient.getCursor(project, topic, shardId, CursorType.LATEST).getCursor();
}
}, DataXCaseEnvUtil.getRetryTimes(7), DataXCaseEnvUtil.getRetryInterval(1000L), DataXCaseEnvUtil.getRetryExponential(true));
} catch (Exception e) {
throw DataXException.asDataXException(DatahubReaderErrorCode.BAD_CONFIG_VALUE,
"get Cursor error, please check! detail error message: " + e.toString());
}
return cursor;
}
public static RecordSchema getDatahubSchemaWithRetry(final DatahubClient datahubClient, final String project, final String topic) {
RecordSchema schema;
try {
schema = RetryUtil.executeWithRetry(new Callable<RecordSchema>() {
@Override
public RecordSchema call() throws Exception {
return datahubClient.getTopic(project, topic).getRecordSchema();
}
}, DataXCaseEnvUtil.getRetryTimes(7), DataXCaseEnvUtil.getRetryInterval(1000L), DataXCaseEnvUtil.getRetryExponential(true));
} catch (Exception e) {
throw DataXException.asDataXException(DatahubReaderErrorCode.BAD_CONFIG_VALUE,
"get Topic Schema error, please check! detail error message: " + e.toString());
}
return schema;
}
public static GetRecordsResult getRecordsResultWithRetry(final DatahubClient datahubClient, final String project,
final String topic, final String shardId, final int batchSize, final String cursor, final RecordSchema schema) {
GetRecordsResult result;
try {
result = RetryUtil.executeWithRetry(new Callable<GetRecordsResult>() {
@Override
public GetRecordsResult call() throws Exception {
return datahubClient.getRecords(project, topic, shardId, schema, cursor, batchSize);
}
}, DataXCaseEnvUtil.getRetryTimes(7), DataXCaseEnvUtil.getRetryInterval(1000L), DataXCaseEnvUtil.getRetryExponential(true));
} catch (Exception e) {
throw DataXException.asDataXException(DatahubReaderErrorCode.BAD_CONFIG_VALUE,
"get Record Result error, please check! detail error message: " + e.toString());
}
return result;
}
public static Column getColumnFromField(RecordEntry record, Field field, String timeStampUnit) {
Column col = null;
TupleRecordData o = (TupleRecordData) record.getRecordData();
switch (field.getType()) {
case SMALLINT:
Short shortValue = ((Short) o.getField(field.getName()));
col = new LongColumn(shortValue == null ? null: shortValue.longValue());
break;
case INTEGER:
col = new LongColumn((Integer) o.getField(field.getName()));
break;
case BIGINT: {
col = new LongColumn((Long) o.getField(field.getName()));
break;
}
case TINYINT: {
Byte byteValue = ((Byte) o.getField(field.getName()));
col = new LongColumn(byteValue == null ? null : byteValue.longValue());
break;
}
case BOOLEAN: {
col = new BoolColumn((Boolean) o.getField(field.getName()));
break;
}
case FLOAT:
col = new DoubleColumn((Float) o.getField(field.getName()));
break;
case DOUBLE: {
col = new DoubleColumn((Double) o.getField(field.getName()));
break;
}
case STRING: {
col = new StringColumn((String) o.getField(field.getName()));
break;
}
case DECIMAL: {
BigDecimal value = (BigDecimal) o.getField(field.getName());
col = new DoubleColumn(value == null ? null : value.doubleValue());
break;
}
case TIMESTAMP: {
Long value = (Long) o.getField(field.getName());
if ("MILLISECOND".equals(timeStampUnit)) {
// MILLISECOND: 13-digit precision, pass to new Date() directly
col = new DateColumn(value == null ? null : new Date(value));
}
else if ("SECOND".equals(timeStampUnit)){
col = new DateColumn(value == null ? null : new Date(value * 1000));
}
else {
// Default is MICROSECOND: 16-digit precision, consistent with the previous logic
col = new DateColumn(value == null ? null : new Date(value / 1000));
}
break;
}
default:
throw new RuntimeException("Unknown column type: " + field.getType());
}
return col;
}
}

View File

@ -0,0 +1,37 @@
package com.alibaba.datax.plugin.reader.datahubreader;
import com.alibaba.datax.common.spi.ErrorCode;
import com.alibaba.datax.common.util.MessageSource;
public enum DatahubWriterErrorCode implements ErrorCode {
MISSING_REQUIRED_VALUE("DatahubWriter-01", MessageSource.loadResourceBundle(DatahubWriterErrorCode.class).message("errorcode.missing_required_value")),
INVALID_CONFIG_VALUE("DatahubWriter-02", MessageSource.loadResourceBundle(DatahubWriterErrorCode.class).message("errorcode.invalid_config_value")),
GET_TOPOIC_INFO_FAIL("DatahubWriter-03", MessageSource.loadResourceBundle(DatahubWriterErrorCode.class).message("errorcode.get_topic_info_fail")),
WRITE_DATAHUB_FAIL("DatahubWriter-04", MessageSource.loadResourceBundle(DatahubWriterErrorCode.class).message("errorcode.write_datahub_fail")),
SCHEMA_NOT_MATCH("DatahubWriter-05", MessageSource.loadResourceBundle(DatahubWriterErrorCode.class).message("errorcode.schema_not_match")),
;
private final String code;
private final String description;
private DatahubWriterErrorCode(String code, String description) {
this.code = code;
this.description = description;
}
@Override
public String getCode() {
return this.code;
}
@Override
public String getDescription() {
return this.description;
}
@Override
public String toString() {
return String.format("Code:[%s], Description:[%s]. ", this.code,
this.description);
}
}


@ -0,0 +1,35 @@
package com.alibaba.datax.plugin.reader.datahubreader;
public final class Key {
/**
* Configuration keys used by this plugin that must be provided by the plugin user
*/
public static final String ENDPOINT = "endpoint";
public static final String ACCESSKEYID = "accessId";
public static final String ACCESSKEYSECRET = "accessKey";
public static final String PROJECT = "project";
public static final String TOPIC = "topic";
public static final String BEGINDATETIME = "beginDateTime";
public static final String ENDDATETIME = "endDateTime";
public static final String BATCHSIZE = "batchSize";
public static final String COLUMN = "column";
public static final String SHARDID = "shardId";
public static final String CONFIG_KEY_ENDPOINT = "endpoint";
public static final String CONFIG_KEY_ACCESS_ID = "accessId";
public static final String CONFIG_KEY_ACCESS_KEY = "accessKey";
public static final String TIMESTAMP_UNIT = "timeStampUnit";
}


@ -0,0 +1,5 @@
errorcode.missing_required_value=\u60A8\u7F3A\u5931\u4E86\u5FC5\u987B\u586B\u5199\u7684\u53C2\u6570\u503C.
errorcode.invalid_config_value=\u60A8\u7684\u53C2\u6570\u914D\u7F6E\u9519\u8BEF.
errorcode.get_topic_info_fail=\u83B7\u53D6shard\u5217\u8868\u5931\u8D25.
errorcode.write_datahub_fail=\u5199\u6570\u636E\u5931\u8D25.
errorcode.schema_not_match=\u6570\u636E\u683C\u5F0F\u9519\u8BEF.


@ -0,0 +1,5 @@
errorcode.missing_required_value=\u60A8\u7F3A\u5931\u4E86\u5FC5\u987B\u586B\u5199\u7684\u53C2\u6570\u503C.
errorcode.invalid_config_value=\u60A8\u7684\u53C2\u6570\u914D\u7F6E\u9519\u8BEF.
errorcode.get_topic_info_fail=\u83B7\u53D6shard\u5217\u8868\u5931\u8D25.
errorcode.write_datahub_fail=\u5199\u6570\u636E\u5931\u8D25.
errorcode.schema_not_match=\u6570\u636E\u683C\u5F0F\u9519\u8BEF.


@ -0,0 +1,5 @@
errorcode.missing_required_value=\u60A8\u7F3A\u5931\u4E86\u5FC5\u987B\u586B\u5199\u7684\u53C2\u6570\u503C.
errorcode.invalid_config_value=\u60A8\u7684\u53C2\u6570\u914D\u7F6E\u9519\u8BEF.
errorcode.get_topic_info_fail=\u83B7\u53D6shard\u5217\u8868\u5931\u8D25.
errorcode.write_datahub_fail=\u5199\u6570\u636E\u5931\u8D25.
errorcode.schema_not_match=\u6570\u636E\u683C\u5F0F\u9519\u8BEF.


@ -0,0 +1,5 @@
errorcode.missing_required_value=\u60A8\u7F3A\u5931\u4E86\u5FC5\u987B\u586B\u5199\u7684\u53C2\u6570\u503C.
errorcode.invalid_config_value=\u60A8\u7684\u53C2\u6570\u914D\u7F6E\u9519\u8BEF.
errorcode.get_topic_info_fail=\u83B7\u53D6shard\u5217\u8868\u5931\u8D25.
errorcode.write_datahub_fail=\u5199\u6570\u636E\u5931\u8D25.
errorcode.schema_not_match=\u6570\u636E\u683C\u5F0F\u9519\u8BEF.


@ -0,0 +1,9 @@
errorcode.missing_required_value=\u60A8\u7F3A\u5931\u4E86\u5FC5\u987B\u586B\u5199\u7684\u53C2\u6570\u503C.
errorcode.invalid_config_value=\u60A8\u7684\u53C2\u6570\u914D\u7F6E\u9519\u8BEF.
errorcode.get_topic_info_fail=\u83B7\u53D6shard\u5217\u8868\u5931\u8D25.
errorcode.write_datahub_fail=\u5199\u6570\u636E\u5931\u8D25.
errorcode.schema_not_match=\u6570\u636E\u683C\u5F0F\u9519\u8BEF.errorcode.missing_required_value=您缺失了必須填寫的參數值.
errorcode.invalid_config_value=您的參數配寘錯誤.
errorcode.get_topic_info_fail=獲取shard清單失敗.
errorcode.write_datahub_fail=寫數據失敗.
errorcode.schema_not_match=數據格式錯誤.


@ -0,0 +1,9 @@
errorcode.missing_required_value=\u60A8\u7F3A\u5931\u4E86\u5FC5\u987B\u586B\u5199\u7684\u53C2\u6570\u503C.
errorcode.invalid_config_value=\u60A8\u7684\u53C2\u6570\u914D\u7F6E\u9519\u8BEF.
errorcode.get_topic_info_fail=\u83B7\u53D6shard\u5217\u8868\u5931\u8D25.
errorcode.write_datahub_fail=\u5199\u6570\u636E\u5931\u8D25.
errorcode.schema_not_match=\u6570\u636E\u683C\u5F0F\u9519\u8BEF.errorcode.missing_required_value=您缺失了必須填寫的參數值.
errorcode.invalid_config_value=您的參數配寘錯誤.
errorcode.get_topic_info_fail=獲取shard清單失敗.
errorcode.write_datahub_fail=寫數據失敗.
errorcode.schema_not_match=數據格式錯誤.


@ -0,0 +1,14 @@
{
"name": "datahubreader",
"parameter": {
"endpoint":"",
"accessId": "",
"accessKey": "",
"project": "",
"topic": "",
"beginDateTime": "20180913121019",
"endDateTime": "20180913121119",
"batchSize": 1024,
"column": []
}
}


@ -0,0 +1,6 @@
{
"name": "datahubreader",
"class": "com.alibaba.datax.plugin.reader.datahubreader.DatahubReader",
"description": "datahub reader",
"developer": "alibaba"
}

datahubwriter/pom.xml (new file)

@ -0,0 +1,79 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>datax-all</artifactId>
<groupId>com.alibaba.datax</groupId>
<version>0.0.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>datahubwriter</artifactId>
<version>0.0.1-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-common</artifactId>
<version>${datax-project-version}</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
</dependency>
<dependency>
<groupId>com.aliyun.datahub</groupId>
<artifactId>aliyun-sdk-datahub</artifactId>
<version>2.21.6-public</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- compiler plugin -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${jdk-version}</source>
<target>${jdk-version}</target>
<encoding>${project-sourceEncoding}</encoding>
</configuration>
</plugin>
<!-- assembly plugin -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptors>
<descriptor>src/main/assembly/package.xml</descriptor>
</descriptors>
<finalName>datax</finalName>
</configuration>
<executions>
<execution>
<id>dwzip</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>


@ -0,0 +1,34 @@
<assembly
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
<id></id>
<formats>
<format>dir</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<fileSets>
<fileSet>
<directory>src/main/resources</directory>
<includes>
<include>plugin.json</include>
</includes>
<outputDirectory>plugin/writer/datahubwriter</outputDirectory>
</fileSet>
<fileSet>
<directory>target/</directory>
<includes>
<include>datahubwriter-0.0.1-SNAPSHOT.jar</include>
</includes>
<outputDirectory>plugin/writer/datahubwriter</outputDirectory>
</fileSet>
</fileSets>
<dependencySets>
<dependencySet>
<useProjectArtifact>false</useProjectArtifact>
<outputDirectory>plugin/writer/datahubwriter/libs</outputDirectory>
<scope>runtime</scope>
</dependencySet>
</dependencySets>
</assembly>


@ -0,0 +1,43 @@
package com.alibaba.datax.plugin.writer.datahubwriter;
import org.apache.commons.lang3.StringUtils;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.TypeReference;
import com.aliyun.datahub.client.DatahubClient;
import com.aliyun.datahub.client.DatahubClientBuilder;
import com.aliyun.datahub.client.auth.Account;
import com.aliyun.datahub.client.auth.AliyunAccount;
import com.aliyun.datahub.client.common.DatahubConfig;
import com.aliyun.datahub.client.http.HttpConfig;
public class DatahubClientHelper {
public static DatahubClient getDatahubClient(Configuration jobConfig) {
String accessId = jobConfig.getNecessaryValue(Key.CONFIG_KEY_ACCESS_ID,
DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
String accessKey = jobConfig.getNecessaryValue(Key.CONFIG_KEY_ACCESS_KEY,
DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
String endpoint = jobConfig.getNecessaryValue(Key.CONFIG_KEY_ENDPOINT,
DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
Account account = new AliyunAccount(accessId, accessKey);
// Whether to enable binary transfer; supported by the server since version 2.12
boolean enableBinary = jobConfig.getBool("enableBinary", false);
DatahubConfig datahubConfig = new DatahubConfig(endpoint, account, enableBinary);
// HttpConfig is optional; defaults are used when it is not set
// For reading and writing data, enabling LZ4 compression for network transfer is recommended
HttpConfig httpConfig = null;
String httpConfigStr = jobConfig.getString("httpConfig");
if (StringUtils.isNotBlank(httpConfigStr)) {
httpConfig = JSON.parseObject(httpConfigStr, new TypeReference<HttpConfig>() {
});
}
DatahubClientBuilder builder = DatahubClientBuilder.newBuilder().setDatahubConfig(datahubConfig);
if (null != httpConfig) {
builder.setHttpConfig(httpConfig);
}
DatahubClient datahubClient = builder.build();
return datahubClient;
}
}


@ -0,0 +1,355 @@
package com.alibaba.datax.plugin.writer.datahubwriter;
import com.alibaba.datax.common.element.Column;
import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordReceiver;
import com.alibaba.datax.common.spi.Writer;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.common.util.DataXCaseEnvUtil;
import com.alibaba.datax.common.util.RetryUtil;
import com.alibaba.fastjson.JSON;
import com.aliyun.datahub.client.DatahubClient;
import com.aliyun.datahub.client.model.FieldType;
import com.aliyun.datahub.client.model.GetTopicResult;
import com.aliyun.datahub.client.model.ListShardResult;
import com.aliyun.datahub.client.model.PutErrorEntry;
import com.aliyun.datahub.client.model.PutRecordsResult;
import com.aliyun.datahub.client.model.RecordEntry;
import com.aliyun.datahub.client.model.RecordSchema;
import com.aliyun.datahub.client.model.RecordType;
import com.aliyun.datahub.client.model.ShardEntry;
import com.aliyun.datahub.client.model.ShardState;
import com.aliyun.datahub.client.model.TupleRecordData;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.concurrent.Callable;
public class DatahubWriter extends Writer {
/**
 * Methods in Job are executed only once; methods in Task are executed in parallel by multiple Task threads started by the framework.
 * <p/>
 * The overall Writer execution flow is:
 * <pre>
 * Job: init --> prepare --> split
 *
 * Task: init --> prepare --> startWrite --> post --> destroy
 * Task: init --> prepare --> startWrite --> post --> destroy
 *
 * Job: post --> destroy
 * </pre>
 */
public static class Job extends Writer.Job {
private static final Logger LOG = LoggerFactory
.getLogger(Job.class);
private Configuration jobConfig = null;
@Override
public void init() {
this.jobConfig = super.getPluginJobConf();
jobConfig.getNecessaryValue(Key.CONFIG_KEY_ENDPOINT, DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
jobConfig.getNecessaryValue(Key.CONFIG_KEY_ACCESS_ID, DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
jobConfig.getNecessaryValue(Key.CONFIG_KEY_ACCESS_KEY, DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
jobConfig.getNecessaryValue(Key.CONFIG_KEY_PROJECT, DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
jobConfig.getNecessaryValue(Key.CONFIG_KEY_TOPIC, DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
}
@Override
public void prepare() {
String project = jobConfig.getNecessaryValue(Key.CONFIG_KEY_PROJECT,
DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
String topic = jobConfig.getNecessaryValue(Key.CONFIG_KEY_TOPIC,
DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
RecordType recordType = null;
DatahubClient client = DatahubClientHelper.getDatahubClient(this.jobConfig);
try {
GetTopicResult getTopicResult = client.getTopic(project, topic);
recordType = getTopicResult.getRecordType();
} catch (Exception e) {
LOG.warn("get topic type error: {}", e.getMessage());
}
if (null != recordType) {
if (recordType == RecordType.BLOB) {
throw DataXException.asDataXException(DatahubWriterErrorCode.WRITE_DATAHUB_FAIL,
"DatahubWriter only support 'Tuple' RecordType now, but your RecordType is 'BLOB'");
}
}
}
@Override
public List<Configuration> split(int mandatoryNumber) {
List<Configuration> configs = new ArrayList<Configuration>();
for (int i = 0; i < mandatoryNumber; ++i) {
configs.add(jobConfig.clone());
}
return configs;
}
@Override
public void post() {}
@Override
public void destroy() {}
}
public static class Task extends Writer.Task {
private static final Logger LOG = LoggerFactory
.getLogger(Task.class);
private static final List<String> FATAL_ERRORS_DEFAULT = Arrays.asList(
"InvalidParameterM",
"MalformedRecord",
"INVALID_SHARDID",
"NoSuchTopic",
"NoSuchShard"
);
private Configuration taskConfig;
private DatahubClient client;
private String project;
private String topic;
private List<String> shards;
private int maxCommitSize;
private int maxRetryCount;
private RecordSchema schema;
private long retryInterval;
private Random random;
private List<String> column;
private List<Integer> columnIndex;
private boolean enableColumnConfig;
private List<String> fatalErrors;
@Override
public void init() {
this.taskConfig = super.getPluginJobConf();
project = taskConfig.getNecessaryValue(Key.CONFIG_KEY_PROJECT, DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
topic = taskConfig.getNecessaryValue(Key.CONFIG_KEY_TOPIC, DatahubWriterErrorCode.MISSING_REQUIRED_VALUE);
maxCommitSize = taskConfig.getInt(Key.CONFIG_KEY_MAX_COMMIT_SIZE, 1024*1024);
maxRetryCount = taskConfig.getInt(Key.CONFIG_KEY_MAX_RETRY_COUNT, 500);
this.retryInterval = taskConfig.getInt(Key.RETRY_INTERVAL, 650);
this.random = new Random();
this.column = this.taskConfig.getList(Key.CONFIG_KEY_COLUMN, String.class);
// ["*"]
if (null != this.column && 1 == this.column.size()) {
if (StringUtils.equals("*", this.column.get(0))) {
this.column = null;
}
}
this.columnIndex = new ArrayList<Integer>();
// Keep a switch as a safety net
this.enableColumnConfig = this.taskConfig.getBool("enableColumnConfig", true);
this.fatalErrors = this.taskConfig.getList("fatalErrors", Task.FATAL_ERRORS_DEFAULT, String.class);
this.client = DatahubClientHelper.getDatahubClient(this.taskConfig);
}
@Override
public void prepare() {
final String shardIdConfig = this.taskConfig.getString(Key.CONFIG_KEY_SHARD_ID);
this.shards = new ArrayList<String>();
try {
RetryUtil.executeWithRetry(new Callable<Void>() {
@Override
public Void call() throws Exception {
ListShardResult result = client.listShard(project, topic);
if (StringUtils.isNotBlank(shardIdConfig)) {
shards.add(shardIdConfig);
} else {
for (ShardEntry shard : result.getShards()) {
if (shard.getState() == ShardState.ACTIVE || shard.getState() == ShardState.OPENING) {
shards.add(shard.getShardId());
}
}
}
schema = client.getTopic(project, topic).getRecordSchema();
return null;
}
}, DataXCaseEnvUtil.getRetryTimes(5), DataXCaseEnvUtil.getRetryInterval(10000L), DataXCaseEnvUtil.getRetryExponential(false));
} catch (Exception e) {
throw DataXException.asDataXException(DatahubWriterErrorCode.GET_TOPOIC_INFO_FAIL,
"get topic info failed", e);
}
LOG.info("datahub topic {} shard to write: {}", this.topic, JSON.toJSONString(this.shards));
LOG.info("datahub topic {} has schema: {}", this.topic, JSON.toJSONString(this.schema));
// Based on the schema order and the user-configured columns, compute the column order for writing to DataHub, so that column reordering is supported
// From here on, columnIndex is used consistently when writing to DataHub
int totalSize = this.schema.getFields().size();
if (null != this.column && !this.column.isEmpty() && this.enableColumnConfig) {
for (String eachCol : this.column) {
int indexFound = -1;
for (int i = 0; i < totalSize; i++) {
// warn: case-insensitive comparison
if (StringUtils.equalsIgnoreCase(eachCol, this.schema.getField(i).getName())) {
indexFound = i;
break;
}
}
if (indexFound >= 0) {
this.columnIndex.add(indexFound);
} else {
throw DataXException.asDataXException(DatahubWriterErrorCode.SCHEMA_NOT_MATCH,
String.format("can not find column %s in datahub topic %s", eachCol, this.topic));
}
}
} else {
for (int i = 0; i < totalSize; i++) {
this.columnIndex.add(i);
}
}
}
@Override
public void startWrite(RecordReceiver recordReceiver) {
Record record;
List<RecordEntry> records = new ArrayList<RecordEntry>();
String shardId = null;
if (1 == this.shards.size()) {
shardId = shards.get(0);
} else {
shardId = shards.get(this.random.nextInt(shards.size()));
}
int commitSize = 0;
try {
while ((record = recordReceiver.getFromReader()) != null) {
RecordEntry dhRecord = convertRecord(record, shardId);
if (dhRecord != null) {
records.add(dhRecord);
}
commitSize += record.getByteSize();
if (commitSize >= maxCommitSize) {
commit(records);
records.clear();
commitSize = 0;
if (1 == this.shards.size()) {
shardId = shards.get(0);
} else {
shardId = shards.get(this.random.nextInt(shards.size()));
}
}
}
if (commitSize > 0) {
commit(records);
}
} catch (Exception e) {
throw DataXException.asDataXException(
DatahubWriterErrorCode.WRITE_DATAHUB_FAIL, e);
}
}
@Override
public void post() {}
@Override
public void destroy() {}
private void commit(List<RecordEntry> records) throws InterruptedException {
PutRecordsResult result = client.putRecords(project, topic, records);
if (result.getFailedRecordCount() > 0) {
for (int i = 0; i < maxRetryCount; ++i) {
boolean limitExceededMessagePrinted = false;
for (PutErrorEntry error : result.getPutErrorEntries()) {
// For LimitExceeded errors, do not log once for every record
if (StringUtils.equalsIgnoreCase("LimitExceeded", error.getErrorcode())) {
if (!limitExceededMessagePrinted) {
LOG.warn("write record error, request id: {}, error code: {}, error message: {}",
result.getRequestId(), error.getErrorcode(), error.getMessage());
limitExceededMessagePrinted = true;
}
} else {
LOG.error("write record error, request id: {}, error code: {}, error message: {}",
result.getRequestId(), error.getErrorcode(), error.getMessage());
}
if (this.fatalErrors.contains(error.getErrorcode())) {
throw DataXException.asDataXException(
DatahubWriterErrorCode.WRITE_DATAHUB_FAIL,
error.getMessage());
}
}
if (this.retryInterval >= 0) {
Thread.sleep(this.retryInterval);
} else {
Thread.sleep(new Random().nextInt(700) + 300);
}
result = client.putRecords(project, topic, result.getFailedRecords());
if (result.getFailedRecordCount() == 0) {
return;
}
}
throw DataXException.asDataXException(
DatahubWriterErrorCode.WRITE_DATAHUB_FAIL,
"write datahub failed");
}
}
private RecordEntry convertRecord(Record dxRecord, String shardId) {
try {
RecordEntry dhRecord = new RecordEntry();
dhRecord.setShardId(shardId);
TupleRecordData data = new TupleRecordData(this.schema);
for (int i = 0; i < this.columnIndex.size(); ++i) {
int orderInSchema = this.columnIndex.get(i);
FieldType type = this.schema.getField(orderInSchema).getType();
Column column = dxRecord.getColumn(i);
switch (type) {
case BIGINT:
data.setField(orderInSchema, column.asLong());
break;
case DOUBLE:
data.setField(orderInSchema, column.asDouble());
break;
case STRING:
data.setField(orderInSchema, column.asString());
break;
case BOOLEAN:
data.setField(orderInSchema, column.asBoolean());
break;
case TIMESTAMP:
if (null == column.asDate()) {
data.setField(orderInSchema, null);
} else {
data.setField(orderInSchema, column.asDate().getTime() * 1000);
}
break;
case DECIMAL:
// warn
data.setField(orderInSchema, column.asBigDecimal());
break;
case INTEGER:
data.setField(orderInSchema, column.asLong());
break;
case FLOAT:
data.setField(orderInSchema, column.asDouble());
break;
case TINYINT:
data.setField(orderInSchema, column.asLong());
break;
case SMALLINT:
data.setField(orderInSchema, column.asLong());
break;
default:
throw DataXException.asDataXException(
DatahubWriterErrorCode.SCHEMA_NOT_MATCH,
String.format("does not support type: %s", type));
}
}
dhRecord.setRecordData(data);
return dhRecord;
} catch (Exception e) {
super.getTaskPluginCollector().collectDirtyRecord(dxRecord, e, "convert record failed");
}
return null;
}
}
}


@ -0,0 +1,37 @@
package com.alibaba.datax.plugin.writer.datahubwriter;
import com.alibaba.datax.common.spi.ErrorCode;
import com.alibaba.datax.common.util.MessageSource;
public enum DatahubWriterErrorCode implements ErrorCode {
MISSING_REQUIRED_VALUE("DatahubWriter-01", MessageSource.loadResourceBundle(DatahubWriterErrorCode.class).message("errorcode.missing_required_value")),
INVALID_CONFIG_VALUE("DatahubWriter-02", MessageSource.loadResourceBundle(DatahubWriterErrorCode.class).message("errorcode.invalid_config_value")),
GET_TOPOIC_INFO_FAIL("DatahubWriter-03", MessageSource.loadResourceBundle(DatahubWriterErrorCode.class).message("errorcode.get_topic_info_fail")),
WRITE_DATAHUB_FAIL("DatahubWriter-04", MessageSource.loadResourceBundle(DatahubWriterErrorCode.class).message("errorcode.write_datahub_fail")),
SCHEMA_NOT_MATCH("DatahubWriter-05", MessageSource.loadResourceBundle(DatahubWriterErrorCode.class).message("errorcode.schema_not_match")),
;
private final String code;
private final String description;
private DatahubWriterErrorCode(String code, String description) {
this.code = code;
this.description = description;
}
@Override
public String getCode() {
return this.code;
}
@Override
public String getDescription() {
return this.description;
}
@Override
public String toString() {
return String.format("Code:[%s], Description:[%s]. ", this.code,
this.description);
}
}


@ -0,0 +1,26 @@
package com.alibaba.datax.plugin.writer.datahubwriter;
public final class Key {
/**
* Configuration keys used by this plugin that must be provided by the plugin user
*/
public static final String CONFIG_KEY_ENDPOINT = "endpoint";
public static final String CONFIG_KEY_ACCESS_ID = "accessId";
public static final String CONFIG_KEY_ACCESS_KEY = "accessKey";
public static final String CONFIG_KEY_PROJECT = "project";
public static final String CONFIG_KEY_TOPIC = "topic";
public static final String CONFIG_KEY_WRITE_MODE = "mode";
public static final String CONFIG_KEY_SHARD_ID = "shardId";
public static final String CONFIG_KEY_MAX_COMMIT_SIZE = "maxCommitSize";
public static final String CONFIG_KEY_MAX_RETRY_COUNT = "maxRetryCount";
public static final String CONFIG_VALUE_SEQUENCE_MODE = "sequence";
public static final String CONFIG_VALUE_RANDOM_MODE = "random";
public final static String MAX_RETRY_TIME = "maxRetryTime";
public final static String RETRY_INTERVAL = "retryInterval";
public final static String CONFIG_KEY_COLUMN = "column";
}

View File

@ -0,0 +1,5 @@
errorcode.missing_required_value=\u60A8\u7F3A\u5931\u4E86\u5FC5\u987B\u586B\u5199\u7684\u53C2\u6570\u503C.
errorcode.invalid_config_value=\u60A8\u7684\u53C2\u6570\u914D\u7F6E\u9519\u8BEF.
errorcode.get_topic_info_fail=\u83B7\u53D6shard\u5217\u8868\u5931\u8D25.
errorcode.write_datahub_fail=\u5199\u6570\u636E\u5931\u8D25.
errorcode.schema_not_match=\u6570\u636E\u683C\u5F0F\u9519\u8BEF.

View File

@ -0,0 +1,5 @@
errorcode.missing_required_value=\u60A8\u7F3A\u5931\u4E86\u5FC5\u987B\u586B\u5199\u7684\u53C2\u6570\u503C.
errorcode.invalid_config_value=\u60A8\u7684\u53C2\u6570\u914D\u7F6E\u9519\u8BEF.
errorcode.get_topic_info_fail=\u83B7\u53D6shard\u5217\u8868\u5931\u8D25.
errorcode.write_datahub_fail=\u5199\u6570\u636E\u5931\u8D25.
errorcode.schema_not_match=\u6570\u636E\u683C\u5F0F\u9519\u8BEF.

View File

@ -0,0 +1,5 @@
errorcode.missing_required_value=\u60A8\u7F3A\u5931\u4E86\u5FC5\u987B\u586B\u5199\u7684\u53C2\u6570\u503C.
errorcode.invalid_config_value=\u60A8\u7684\u53C2\u6570\u914D\u7F6E\u9519\u8BEF.
errorcode.get_topic_info_fail=\u83B7\u53D6shard\u5217\u8868\u5931\u8D25.
errorcode.write_datahub_fail=\u5199\u6570\u636E\u5931\u8D25.
errorcode.schema_not_match=\u6570\u636E\u683C\u5F0F\u9519\u8BEF.

View File

@ -0,0 +1,5 @@
errorcode.missing_required_value=\u60A8\u7F3A\u5931\u4E86\u5FC5\u987B\u586B\u5199\u7684\u53C2\u6570\u503C.
errorcode.invalid_config_value=\u60A8\u7684\u53C2\u6570\u914D\u7F6E\u9519\u8BEF.
errorcode.get_topic_info_fail=\u83B7\u53D6shard\u5217\u8868\u5931\u8D25.
errorcode.write_datahub_fail=\u5199\u6570\u636E\u5931\u8D25.
errorcode.schema_not_match=\u6570\u636E\u683C\u5F0F\u9519\u8BEF.

View File

@ -0,0 +1,9 @@
errorcode.missing_required_value=\u60A8\u7F3A\u5931\u4E86\u5FC5\u987B\u586B\u5199\u7684\u53C2\u6570\u503C.
errorcode.invalid_config_value=\u60A8\u7684\u53C2\u6570\u914D\u7F6E\u9519\u8BEF.
errorcode.get_topic_info_fail=\u83B7\u53D6shard\u5217\u8868\u5931\u8D25.
errorcode.write_datahub_fail=\u5199\u6570\u636E\u5931\u8D25.
errorcode.schema_not_match=\u6570\u636E\u683C\u5F0F\u9519\u8BEF.
errorcode.missing_required_value=您缺失了必須填寫的參數值.
errorcode.invalid_config_value=您的參數配寘錯誤.
errorcode.get_topic_info_fail=獲取shard清單失敗.
errorcode.write_datahub_fail=寫數據失敗.
errorcode.schema_not_match=數據格式錯誤.

View File

@ -0,0 +1,9 @@
errorcode.missing_required_value=\u60A8\u7F3A\u5931\u4E86\u5FC5\u987B\u586B\u5199\u7684\u53C2\u6570\u503C.
errorcode.invalid_config_value=\u60A8\u7684\u53C2\u6570\u914D\u7F6E\u9519\u8BEF.
errorcode.get_topic_info_fail=\u83B7\u53D6shard\u5217\u8868\u5931\u8D25.
errorcode.write_datahub_fail=\u5199\u6570\u636E\u5931\u8D25.
errorcode.schema_not_match=\u6570\u636E\u683C\u5F0F\u9519\u8BEF.
errorcode.missing_required_value=您缺失了必須填寫的參數值.
errorcode.invalid_config_value=您的參數配寘錯誤.
errorcode.get_topic_info_fail=獲取shard清單失敗.
errorcode.write_datahub_fail=寫數據失敗.
errorcode.schema_not_match=數據格式錯誤.

View File

@ -0,0 +1,14 @@
{
"name": "datahubwriter",
"parameter": {
"endpoint":"",
"accessId": "",
"accessKey": "",
"project": "",
"topic": "",
"mode": "random",
"shardId": "",
"maxCommitSize": 524288,
"maxRetryCount": 500
}
}

View File

@ -0,0 +1,6 @@
{
"name": "datahubwriter",
"class": "com.alibaba.datax.plugin.writer.datahubwriter.DatahubWriter",
"description": "datahub writer",
"developer": "alibaba"
}

View File

@ -0,0 +1,181 @@
# DorisWriter Plugin Documentation
## 1 Quick Introduction
DorisWriter supports writing large batches of data into Doris.
## 2 Implementation Principle
DorisWriter imports data through Stream Load, which Doris supports natively: the rows read by the `reader` are buffered in memory, serialized into CSV or JSON text, and then imported into Doris in batches.
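To make the mechanism concrete, below is a minimal, hand-written sketch of the kind of Stream Load request that DorisWriter builds internally; it is illustrative only. The FE address, database, table, credentials and the sample row are placeholders, and a production client must additionally follow the FE's HTTP redirect to a BE node (the plugin handles this with a custom redirect strategy in its HTTP client).
```java
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Base64;

// Illustrative sketch only: shows the shape of a Stream Load PUT request.
public class StreamLoadSketch {
    public static void main(String[] args) throws Exception {
        String feHost = "http://127.0.0.1:8030";             // placeholder FE ip:http_port
        String db = "demo";                                  // placeholder database
        String table = "all_employees_info";                 // placeholder table
        String auth = Base64.getEncoder()
                .encodeToString("root:".getBytes(StandardCharsets.UTF_8)); // "user:password"

        // Rows buffered in memory are joined with '\n' (CSV mode) before being sent.
        byte[] body = "10001\t1953-09-02\tGeorgi\tFacello\tM\t1986-06-26\n"
                .getBytes(StandardCharsets.UTF_8);

        URL url = new URL(feHost + "/api/" + db + "/" + table + "/_stream_load");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("PUT");
        conn.setDoOutput(true);
        conn.setRequestProperty("Authorization", "Basic " + auth);
        conn.setRequestProperty("label", "datax_doris_writer_example_label"); // must be unique per batch
        conn.setRequestProperty("column_separator", "\t");
        try (OutputStream os = conn.getOutputStream()) {
            os.write(body);
        }
        // Doris answers with a JSON body whose "Status" field should be "Success".
        System.out.println("HTTP " + conn.getResponseCode());
    }
}
```
The plugin's actual implementation (DorisStreamLoadObserver, shown later in this change set) additionally sends an `Expect: 100-continue` header, forwards `loadProps` entries as extra request headers, and re-checks the label state when Doris reports that the label already exists.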
## 3 Features
### 3.1 Sample Configuration
Here is a configuration file that reads data from MySQL and imports it into Doris.
```
{
"job": {
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"column": ["emp_no", "birth_date", "first_name","last_name","gender","hire_date"],
"connection": [
{
"jdbcUrl": ["jdbc:mysql://localhost:3306/demo"],
"table": ["employees_1"]
}
],
"username": "root",
"password": "xxxxx",
"where": ""
}
},
"writer": {
"name": "doriswriter",
"parameter": {
"loadUrl": ["172.16.0.13:8030"],
"loadProps": {
},
"column": ["emp_no", "birth_date", "first_name","last_name","gender","hire_date"],
"username": "root",
"password": "xxxxxx",
"postSql": ["select count(1) from all_employees_info"],
"preSql": [],
"flushInterval":30000,
"connection": [
{
"jdbcUrl": "jdbc:mysql://172.16.0.13:9030/demo",
"selectedDatabase": "demo",
"table": ["all_employees_info"]
}
],
"loadProps": {
"format": "json",
"strip_outer_array": true
}
}
}
}
],
"setting": {
"speed": {
"channel": "1"
}
}
}
}
```
### 3.2 Parameter Description
* **jdbcUrl**
  - Description: the JDBC connection string of Doris, used to execute preSql and postSql statements.
  - Required: yes
  - Default: none
* **loadUrl**
  - Description: the Stream Load target, in the form "ip:port", where ip is a FE node address and port is that FE node's http_port. Multiple addresses can be listed in the array; doriswriter will access them in a round-robin manner.
  - Required: yes
  - Default: none
* **username**
  - Description: username for accessing the Doris database.
  - Required: yes
  - Default: none
* **password**
  - Description: password for accessing the Doris database.
  - Required: no
  - Default: empty
* **connection.selectedDatabase**
  - Description: name of the Doris database to write to.
  - Required: yes
  - Default: none
* **connection.table**
  - Description: name of the Doris table to write to.
  - Required: yes
  - Default: none
* **column**
  - Description: the fields of the destination table that **data will be written into**; these field names become the keys of the generated JSON records. Separate fields with commas, e.g. "column": ["id","name","age"].
  - Required: yes
  - Default: none
* **preSql**
  - Description: standard SQL statements executed before data is written to the destination table.
  - Required: no
  - Default: none
* **postSql**
  - Description: standard SQL statements executed after data has been written to the destination table.
  - Required: no
  - Default: none
* **maxBatchRows**
  - Description: maximum number of rows per import batch. Together with **batchSize** it controls the size of each batch; as soon as either threshold is reached, the batch is imported.
  - Required: no
  - Default: 500000
* **batchSize**
  - Description: maximum amount of data (in bytes) per import batch. Together with **maxBatchRows** it controls the size of each batch; as soon as either threshold is reached, the batch is imported.
  - Required: no
  - Default: 94371840 (90MB)
* **maxRetries**
  - Description: number of retries after a batch import fails.
  - Required: no
  - Default: 3 (the current implementation always retries up to 3 times)
* **labelPrefix**
  - Description: label prefix for each import batch. The final label is composed of `labelPrefix + UUID`, forming a globally unique label that ensures data is not imported twice.
  - Required: no
  - Default: `datax_doris_writer_`
* **loadProps**
  - Description: request parameters for Stream Load; see the Stream Load documentation for details: [Stream load - Apache Doris](https://doris.apache.org/zh-CN/docs/data-operate/import/import-way/stream-load-manual)
    This includes the import data format (`format`, etc.). CSV is used by default and JSON is also supported; see the type conversion section below or the official Stream Load documentation above.
  - Required: no
  - Default: none
### Type Conversion
By default, all incoming values are converted to strings and joined with `\t` as the column separator and `\n` as the row separator to form a `csv` payload for Stream Load. For example, a row with the values (1, NULL, "foo") is serialized as the line `1\t\N\tfoo`.
The default import format is csv. To change the column separator, simply configure `loadProps` accordingly:
```json
"loadProps": {
"column_separator": "\\x01",
"row_delimiter": "\\x02"
}
```
To change the import format to `json`, simply configure `loadProps` accordingly:
```json
"loadProps": {
"format": "json",
"strip_outer_array": true
}
```
For more information, see the Doris documentation: [Stream load - Apache Doris](https://doris.apache.org/zh-CN/docs/data-operate/import/import-way/stream-load-manual)

View File

@ -0,0 +1,46 @@
{
"job": {
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"column": ["k1", "k2", "k3"],
"connection": [
{
"jdbcUrl": ["jdbc:mysql://192.168.10.10:3306/db1"],
"table": ["t1"]
}
],
"username": "root",
"password": "",
"where": ""
}
},
"writer": {
"name": "doriswriter",
"parameter": {
"loadUrl": ["192.168.1.1:8030"],
"loadProps": {},
"database": "db1",
"column": ["k1", "k2", "k3"],
"username": "root",
"password": "",
"postSql": [],
"preSql": [],
"connection": [
"jdbcUrl":"jdbc:mysql://192.168.1.1:9030/",
"table":["xxx"],
"selectedDatabase":"xxxx"
]
}
}
}
],
"setting": {
"speed": {
"channel": "1"
}
}
}
}

doriswriter/pom.xml Normal file
View File

@ -0,0 +1,99 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>datax-all</artifactId>
<groupId>com.alibaba.datax</groupId>
<version>0.0.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>doriswriter</artifactId>
<name>doriswriter</name>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-common</artifactId>
<version>${datax-project-version}</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
</dependency>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>plugin-rdbms-util</artifactId>
<version>${datax-project-version}</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>${mysql.driver.version}</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- compiler plugin -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${jdk-version}</source>
<target>${jdk-version}</target>
<encoding>${project-sourceEncoding}</encoding>
</configuration>
</plugin>
<!-- assembly plugin -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptors>
<descriptor>src/main/assembly/package.xml</descriptor>
</descriptors>
<finalName>datax</finalName>
</configuration>
<executions>
<execution>
<id>dwzip</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,52 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<assembly xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
<id/>
<formats>
<format>dir</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<fileSets>
<fileSet>
<directory>src/main/resources</directory>
<includes>
<include>plugin.json</include>
<include>plugin_job_template.json</include>
</includes>
<outputDirectory>plugin/writer/doriswriter</outputDirectory>
</fileSet>
<fileSet>
<directory>target/</directory>
<includes>
<include>doriswriter-0.0.1-SNAPSHOT.jar</include>
</includes>
<outputDirectory>plugin/writer/doriswriter</outputDirectory>
</fileSet>
</fileSets>
<dependencySets>
<dependencySet>
<useProjectArtifact>false</useProjectArtifact>
<outputDirectory>plugin/writer/doriswriter/libs</outputDirectory>
<scope>runtime</scope>
</dependencySet>
</dependencySets>
</assembly>

View File

@ -0,0 +1,54 @@
package com.alibaba.datax.plugin.writer.doriswriter;
import com.google.common.base.Strings;
import java.io.StringWriter;
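// Translates delimiter specifications of the form "\xNN..." (hex byte escapes, e.g. "\x01")
// into the corresponding character sequence; any other non-empty string is returned unchanged,
// and a null/empty value falls back to the supplied default.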
public class DelimiterParser {
private static final String HEX_STRING = "0123456789ABCDEF";
public static String parse(String sp, String dSp) throws RuntimeException {
if ( Strings.isNullOrEmpty(sp)) {
return dSp;
}
if (!sp.toUpperCase().startsWith("\\X")) {
return sp;
}
String hexStr = sp.substring(2);
// check hex str
if (hexStr.isEmpty()) {
throw new RuntimeException("Failed to parse delimiter: `Hex str is empty`");
}
if (hexStr.length() % 2 != 0) {
throw new RuntimeException("Failed to parse delimiter: `Hex str length error`");
}
for (char hexChar : hexStr.toUpperCase().toCharArray()) {
if (HEX_STRING.indexOf(hexChar) == -1) {
throw new RuntimeException("Failed to parse delimiter: `Hex str format error`");
}
}
// transform to separator
StringWriter writer = new StringWriter();
for (byte b : hexStrToBytes(hexStr)) {
writer.append((char) b);
}
return writer.toString();
}
private static byte[] hexStrToBytes(String hexStr) {
String upperHexStr = hexStr.toUpperCase();
int length = upperHexStr.length() / 2;
char[] hexChars = upperHexStr.toCharArray();
byte[] bytes = new byte[length];
for (int i = 0; i < length; i++) {
int pos = i * 2;
bytes[i] = (byte) (charToByte(hexChars[pos]) << 4 | charToByte(hexChars[pos + 1]));
}
return bytes;
}
private static byte charToByte(char c) {
return (byte) HEX_STRING.indexOf(c);
}
}

View File

@ -0,0 +1,23 @@
package com.alibaba.datax.plugin.writer.doriswriter;
import com.alibaba.datax.common.element.Column;
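// Common column-to-string conversion shared by the CSV and JSON codecs:
// NULL columns become null, BOOL becomes 0/1, BYTES are folded into an unsigned
// big-endian integer, and all other types use the column's string form.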
public class DorisBaseCodec {
protected String convertionField( Column col) {
if (null == col.getRawData() || Column.Type.NULL == col.getType()) {
return null;
}
if ( Column.Type.BOOL == col.getType()) {
return String.valueOf(col.asLong());
}
if ( Column.Type.BYTES == col.getType()) {
byte[] bts = (byte[])col.getRawData();
long value = 0;
for (int i = 0; i < bts.length; i++) {
value += (bts[bts.length - i - 1] & 0xffL) << (8 * i);
}
return String.valueOf(value);
}
return col.asString();
}
}

View File

@ -0,0 +1,10 @@
package com.alibaba.datax.plugin.writer.doriswriter;
import com.alibaba.datax.common.element.Record;
import java.io.Serializable;
public interface DorisCodec extends Serializable {
String codec( Record row);
}

View File

@ -0,0 +1,19 @@
package com.alibaba.datax.plugin.writer.doriswriter;
import java.util.Map;
public class DorisCodecFactory {
public DorisCodecFactory (){
}
public static DorisCodec createCodec( Keys writerOptions) {
if ( Keys.StreamLoadFormat.CSV.equals(writerOptions.getStreamLoadFormat())) {
Map<String, Object> props = writerOptions.getLoadProps();
return new DorisCsvCodec (null == props || !props.containsKey("column_separator") ? null : String.valueOf(props.get("column_separator")));
}
if ( Keys.StreamLoadFormat.JSON.equals(writerOptions.getStreamLoadFormat())) {
return new DorisJsonCodec (writerOptions.getColumns());
}
throw new RuntimeException("Failed to create row serializer, unsupported `format` from stream load properties.");
}
}

View File

@ -0,0 +1,27 @@
package com.alibaba.datax.plugin.writer.doriswriter;
import com.alibaba.datax.common.element.Record;
public class DorisCsvCodec extends DorisBaseCodec implements DorisCodec {
private static final long serialVersionUID = 1L;
private final String columnSeparator;
public DorisCsvCodec ( String sp) {
this.columnSeparator = DelimiterParser.parse(sp, "\t");
}
@Override
public String codec( Record row) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < row.getColumnNumber(); i++) {
String value = convertionField(row.getColumn(i));
sb.append(null == value ? "\\N" : value);
if (i < row.getColumnNumber() - 1) {
sb.append(columnSeparator);
}
}
return sb.toString();
}
}

View File

@ -0,0 +1,33 @@
package com.alibaba.datax.plugin.writer.doriswriter;
import com.alibaba.datax.common.element.Record;
import com.alibaba.fastjson.JSON;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class DorisJsonCodec extends DorisBaseCodec implements DorisCodec {
private static final long serialVersionUID = 1L;
private final List<String> fieldNames;
public DorisJsonCodec ( List<String> fieldNames) {
this.fieldNames = fieldNames;
}
@Override
public String codec( Record row) {
if (null == fieldNames) {
return "";
}
Map<String, Object> rowMap = new HashMap<> (fieldNames.size());
int idx = 0;
for (String fieldName : fieldNames) {
rowMap.put(fieldName, convertionField(row.getColumn(idx)));
idx++;
}
return JSON.toJSONString(rowMap);
}
}

View File

@ -0,0 +1,233 @@
package com.alibaba.datax.plugin.writer.doriswriter;
import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.binary.Base64;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPut;
import org.apache.http.entity.ByteArrayEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultRedirectStrategy;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
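// Issues the actual Stream Load HTTP PUT against a reachable FE node (chosen round-robin
// from loadUrl), following redirects. If Doris reports that the batch label already exists,
// it polls get_load_state until the label reaches VISIBLE/COMMITTED, otherwise it fails.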
public class DorisStreamLoadObserver {
private static final Logger LOG = LoggerFactory.getLogger(DorisStreamLoadObserver.class);
private Keys options;
private long pos;
private static final String RESULT_FAILED = "Fail";
private static final String RESULT_LABEL_EXISTED = "Label Already Exists";
private static final String LAEBL_STATE_VISIBLE = "VISIBLE";
private static final String LAEBL_STATE_COMMITTED = "COMMITTED";
private static final String RESULT_LABEL_PREPARE = "PREPARE";
private static final String RESULT_LABEL_ABORTED = "ABORTED";
private static final String RESULT_LABEL_UNKNOWN = "UNKNOWN";
public DorisStreamLoadObserver ( Keys options){
this.options = options;
}
public void streamLoad(WriterTuple data) throws Exception {
String host = getLoadHost();
if(host == null){
throw new IOException("loadUrl cannot be empty, and none of the configured hosts could be connected to. Please check your configuration.");
}
String loadUrl = new StringBuilder(host)
.append("/api/")
.append(options.getDatabase())
.append("/")
.append(options.getTable())
.append("/_stream_load")
.toString();
LOG.info("Start to join batch data: rows[{}] bytes[{}] label[{}].", data.getRows().size(), data.getBytes(), data.getLabel());
Map<String, Object> loadResult = put(loadUrl, data.getLabel(), addRows(data.getRows(), data.getBytes().intValue()));
LOG.info("StreamLoad response :{}",JSON.toJSONString(loadResult));
final String keyStatus = "Status";
if (null == loadResult || !loadResult.containsKey(keyStatus)) {
throw new IOException("Unable to flush data to Doris: unknown result status.");
}
LOG.debug("StreamLoad response:{}",JSON.toJSONString(loadResult));
if (RESULT_FAILED.equals(loadResult.get(keyStatus))) {
throw new IOException(
new StringBuilder("Failed to flush data to Doris.\n").append(JSON.toJSONString(loadResult)).toString()
);
} else if (RESULT_LABEL_EXISTED.equals(loadResult.get(keyStatus))) {
LOG.debug("StreamLoad response:{}",JSON.toJSONString(loadResult));
checkStreamLoadState(host, data.getLabel());
}
}
private void checkStreamLoadState(String host, String label) throws IOException {
int idx = 0;
while(true) {
try {
TimeUnit.SECONDS.sleep(Math.min(++idx, 5));
} catch (InterruptedException ex) {
break;
}
try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
HttpGet httpGet = new HttpGet(new StringBuilder(host).append("/api/").append(options.getDatabase()).append("/get_load_state?label=").append(label).toString());
httpGet.setHeader("Authorization", getBasicAuthHeader(options.getUsername(), options.getPassword()));
httpGet.setHeader("Connection", "close");
try (CloseableHttpResponse resp = httpclient.execute(httpGet)) {
HttpEntity respEntity = getHttpEntity(resp);
if (respEntity == null) {
throw new IOException(String.format("Failed to flush data to Doris, Error " +
"could not get the final state of label[%s].\n", label), null);
}
Map<String, Object> result = (Map<String, Object>)JSON.parse(EntityUtils.toString(respEntity));
String labelState = (String)result.get("state");
if (null == labelState) {
throw new IOException(String.format("Failed to flush data to Doris, Error " +
"could not get the final state of label[%s]. response[%s]\n", label, EntityUtils.toString(respEntity)), null);
}
LOG.info(String.format("Checking label[%s] state[%s]\n", label, labelState));
switch(labelState) {
case LAEBL_STATE_VISIBLE:
case LAEBL_STATE_COMMITTED:
return;
case RESULT_LABEL_PREPARE:
continue;
case RESULT_LABEL_ABORTED:
throw new DorisWriterExcetion (String.format("Failed to flush data to Doris, Error " +
"label[%s] state[%s]\n", label, labelState), null, true);
case RESULT_LABEL_UNKNOWN:
default:
throw new IOException(String.format("Failed to flush data to Doris, Error " +
"label[%s] state[%s]\n", label, labelState), null);
}
}
}
}
}
private byte[] addRows(List<byte[]> rows, int totalBytes) {
if (Keys.StreamLoadFormat.CSV.equals(options.getStreamLoadFormat())) {
Map<String, Object> props = (options.getLoadProps() == null ? new HashMap<> () : options.getLoadProps());
byte[] lineDelimiter = DelimiterParser.parse((String)props.get("row_delimiter"), "\n").getBytes(StandardCharsets.UTF_8);
ByteBuffer bos = ByteBuffer.allocate(totalBytes + rows.size() * lineDelimiter.length);
for (byte[] row : rows) {
bos.put(row);
bos.put(lineDelimiter);
}
return bos.array();
}
if (Keys.StreamLoadFormat.JSON.equals(options.getStreamLoadFormat())) {
ByteBuffer bos = ByteBuffer.allocate(totalBytes + (rows.isEmpty() ? 2 : rows.size() + 1));
bos.put("[".getBytes(StandardCharsets.UTF_8));
byte[] jsonDelimiter = ",".getBytes(StandardCharsets.UTF_8);
boolean isFirstElement = true;
for (byte[] row : rows) {
if (!isFirstElement) {
bos.put(jsonDelimiter);
}
bos.put(row);
isFirstElement = false;
}
bos.put("]".getBytes(StandardCharsets.UTF_8));
return bos.array();
}
throw new RuntimeException("Failed to join rows data, unsupported `format` from stream load properties:");
}
private Map<String, Object> put(String loadUrl, String label, byte[] data) throws IOException {
LOG.info(String.format("Executing stream load to: '%s', size: '%s'", loadUrl, data.length));
final HttpClientBuilder httpClientBuilder = HttpClients.custom()
.setRedirectStrategy(new DefaultRedirectStrategy () {
@Override
protected boolean isRedirectable(String method) {
return true;
}
});
try ( CloseableHttpClient httpclient = httpClientBuilder.build()) {
HttpPut httpPut = new HttpPut(loadUrl);
List<String> cols = options.getColumns();
if (null != cols && !cols.isEmpty() && Keys.StreamLoadFormat.CSV.equals(options.getStreamLoadFormat())) {
httpPut.setHeader("columns", String.join(",", cols.stream().map(f -> String.format("`%s`", f)).collect(Collectors.toList())));
}
if (null != options.getLoadProps()) {
for (Map.Entry<String, Object> entry : options.getLoadProps().entrySet()) {
httpPut.setHeader(entry.getKey(), String.valueOf(entry.getValue()));
}
}
httpPut.setHeader("Expect", "100-continue");
httpPut.setHeader("label", label);
httpPut.setHeader("Content-Type", "application/x-www-form-urlencoded");
httpPut.setHeader("Authorization", getBasicAuthHeader(options.getUsername(), options.getPassword()));
httpPut.setEntity(new ByteArrayEntity (data));
httpPut.setConfig(RequestConfig.custom().setRedirectsEnabled(true).build());
try ( CloseableHttpResponse resp = httpclient.execute(httpPut)) {
HttpEntity respEntity = getHttpEntity(resp);
if (respEntity == null)
return null;
return (Map<String, Object>)JSON.parse(EntityUtils.toString(respEntity));
}
}
}
private String getBasicAuthHeader(String username, String password) {
String auth = username + ":" + password;
byte[] encodedAuth = Base64.encodeBase64(auth.getBytes(StandardCharsets.UTF_8));
return new StringBuilder("Basic ").append(new String(encodedAuth)).toString();
}
private HttpEntity getHttpEntity(CloseableHttpResponse resp) {
int code = resp.getStatusLine().getStatusCode();
if (200 != code) {
LOG.warn("Request failed with code:{}", code);
return null;
}
HttpEntity respEntity = resp.getEntity();
if (null == respEntity) {
LOG.warn("Request failed with empty response.");
return null;
}
return respEntity;
}
private String getLoadHost() {
List<String> hostList = options.getLoadUrlList();
long tmp = pos + hostList.size();
for (; pos < tmp; pos++) {
String host = new StringBuilder("http://").append(hostList.get((int) (pos % hostList.size()))).toString();
if (checkConnection(host)) {
return host;
}
}
return null;
}
private boolean checkConnection(String host) {
try {
URL url = new URL(host);
HttpURLConnection co = (HttpURLConnection) url.openConnection();
co.setConnectTimeout(5000);
co.connect();
co.disconnect();
return true;
} catch (Exception e1) {
e1.printStackTrace();
return false;
}
}
}

View File

@ -0,0 +1,105 @@
package com.alibaba.datax.plugin.writer.doriswriter;
import com.alibaba.datax.plugin.rdbms.util.DBUtil;
import com.alibaba.datax.plugin.rdbms.util.DataBaseType;
import com.alibaba.datax.plugin.rdbms.util.RdbmsException;
import com.alibaba.datax.plugin.rdbms.writer.Constant;
import com.alibaba.druid.sql.parser.ParserException;
import com.google.common.base.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* jdbc util
*/
public class DorisUtil {
private static final Logger LOG = LoggerFactory.getLogger(DorisUtil.class);
private DorisUtil() {}
public static List<String> getDorisTableColumns( Connection conn, String databaseName, String tableName) {
String currentSql = String.format("SELECT COLUMN_NAME FROM `information_schema`.`COLUMNS` WHERE `TABLE_SCHEMA` = '%s' AND `TABLE_NAME` = '%s' ORDER BY `ORDINAL_POSITION` ASC;", databaseName, tableName);
List<String> columns = new ArrayList<> ();
ResultSet rs = null;
try {
rs = DBUtil.query(conn, currentSql);
while (DBUtil.asyncResultSetNext(rs)) {
String colName = rs.getString("COLUMN_NAME");
columns.add(colName);
}
return columns;
} catch (Exception e) {
throw RdbmsException.asQueryException(DataBaseType.MySql, e, currentSql, null, null);
} finally {
DBUtil.closeDBResources(rs, null, null);
}
}
public static List<String> renderPreOrPostSqls(List<String> preOrPostSqls, String tableName) {
if (null == preOrPostSqls) {
return Collections.emptyList();
}
List<String> renderedSqls = new ArrayList<>();
for (String sql : preOrPostSqls) {
if (! Strings.isNullOrEmpty(sql)) {
renderedSqls.add(sql.replace(Constant.TABLE_NAME_PLACEHOLDER, tableName));
}
}
return renderedSqls;
}
public static void executeSqls(Connection conn, List<String> sqls) {
Statement stmt = null;
String currentSql = null;
try {
stmt = conn.createStatement();
for (String sql : sqls) {
currentSql = sql;
DBUtil.executeSqlWithoutResultSet(stmt, sql);
}
} catch (Exception e) {
throw RdbmsException.asQueryException(DataBaseType.MySql, e, currentSql, null, null);
} finally {
DBUtil.closeDBResources(null, stmt, null);
}
}
public static void preCheckPrePareSQL( Keys options) {
String table = options.getTable();
List<String> preSqls = options.getPreSqlList();
List<String> renderedPreSqls = DorisUtil.renderPreOrPostSqls(preSqls, table);
if (null != renderedPreSqls && !renderedPreSqls.isEmpty()) {
LOG.info("Begin to preCheck preSqls:[{}].", String.join(";", renderedPreSqls));
for (String sql : renderedPreSqls) {
try {
DBUtil.sqlValid(sql, DataBaseType.MySql);
} catch ( ParserException e) {
throw RdbmsException.asPreSQLParserException(DataBaseType.MySql,e,sql);
}
}
}
}
public static void preCheckPostSQL( Keys options) {
String table = options.getTable();
List<String> postSqls = options.getPostSqlList();
List<String> renderedPostSqls = DorisUtil.renderPreOrPostSqls(postSqls, table);
if (null != renderedPostSqls && !renderedPostSqls.isEmpty()) {
LOG.info("Begin to preCheck postSqls:[{}].", String.join(";", renderedPostSqls));
for(String sql : renderedPostSqls) {
try {
DBUtil.sqlValid(sql, DataBaseType.MySql);
} catch (ParserException e){
throw RdbmsException.asPostSQLParserException(DataBaseType.MySql,e,sql);
}
}
}
}
}

View File

@ -0,0 +1,164 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package com.alibaba.datax.plugin.writer.doriswriter;
import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordReceiver;
import com.alibaba.datax.common.spi.Writer;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.plugin.rdbms.util.DBUtil;
import com.alibaba.datax.plugin.rdbms.util.DBUtilErrorCode;
import com.alibaba.datax.plugin.rdbms.util.DataBaseType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.List;
/**
* doris data writer
*/
public class DorisWriter extends Writer {
public static class Job extends Writer.Job {
private static final Logger LOG = LoggerFactory.getLogger(Job.class);
private Configuration originalConfig = null;
private Keys options;
@Override
public void init() {
this.originalConfig = super.getPluginJobConf();
options = new Keys (super.getPluginJobConf());
options.doPretreatment();
}
@Override
public void preCheck(){
this.init();
DorisUtil.preCheckPrePareSQL(options);
DorisUtil.preCheckPostSQL(options);
}
@Override
public void prepare() {
String username = options.getUsername();
String password = options.getPassword();
String jdbcUrl = options.getJdbcUrl();
List<String> renderedPreSqls = DorisUtil.renderPreOrPostSqls(options.getPreSqlList(), options.getTable());
if (null != renderedPreSqls && !renderedPreSqls.isEmpty()) {
Connection conn = DBUtil.getConnection(DataBaseType.MySql, jdbcUrl, username, password);
LOG.info("Begin to execute preSqls:[{}]. context info:{}.", String.join(";", renderedPreSqls), jdbcUrl);
DorisUtil.executeSqls(conn, renderedPreSqls);
DBUtil.closeDBResources(null, null, conn);
}
}
@Override
public List<Configuration> split(int mandatoryNumber) {
List<Configuration> configurations = new ArrayList<>(mandatoryNumber);
for (int i = 0; i < mandatoryNumber; i++) {
configurations.add(originalConfig);
}
return configurations;
}
@Override
public void post() {
String username = options.getUsername();
String password = options.getPassword();
String jdbcUrl = options.getJdbcUrl();
List<String> renderedPostSqls = DorisUtil.renderPreOrPostSqls(options.getPostSqlList(), options.getTable());
if (null != renderedPostSqls && !renderedPostSqls.isEmpty()) {
Connection conn = DBUtil.getConnection(DataBaseType.MySql, jdbcUrl, username, password);
LOG.info("Start to execute preSqls:[{}]. context info:{}.", String.join(";", renderedPostSqls), jdbcUrl);
DorisUtil.executeSqls(conn, renderedPostSqls);
DBUtil.closeDBResources(null, null, conn);
}
}
@Override
public void destroy() {
}
}
public static class Task extends Writer.Task {
private DorisWriterManager writerManager;
private Keys options;
private DorisCodec rowCodec;
@Override
public void init() {
options = new Keys (super.getPluginJobConf());
if (options.isWildcardColumn()) {
Connection conn = DBUtil.getConnection(DataBaseType.MySql, options.getJdbcUrl(), options.getUsername(), options.getPassword());
List<String> columns = DorisUtil.getDorisTableColumns(conn, options.getDatabase(), options.getTable());
options.setInfoCchemaColumns(columns);
}
writerManager = new DorisWriterManager(options);
rowCodec = DorisCodecFactory.createCodec(options);
}
@Override
public void prepare() {
}
public void startWrite(RecordReceiver recordReceiver) {
try {
Record record;
while ((record = recordReceiver.getFromReader()) != null) {
if (record.getColumnNumber() != options.getColumns().size()) {
throw DataXException
.asDataXException(
DBUtilErrorCode.CONF_ERROR,
String.format(
"There is an error in the column configuration information. " +
"This is because you have configured a task where the number of fields to be read from the source:%s " +
"is not equal to the number of fields to be written to the destination table:%s. " +
"Please check your configuration and make changes.",
record.getColumnNumber(),
options.getColumns().size()));
}
writerManager.writeRecord(rowCodec.codec(record));
}
} catch (Exception e) {
throw DataXException.asDataXException(DBUtilErrorCode.WRITE_DATA_ERROR, e);
}
}
@Override
public void post() {
try {
writerManager.close();
} catch (Exception e) {
throw DataXException.asDataXException(DBUtilErrorCode.WRITE_DATA_ERROR, e);
}
}
@Override
public void destroy() {}
@Override
public boolean supportFailOver(){
return false;
}
}
}

View File

@ -0,0 +1,29 @@
package com.alibaba.datax.plugin.writer.doriswriter;
import java.io.IOException;
import java.util.Map;
public class DorisWriterExcetion extends IOException {
private final Map<String, Object> response;
private boolean reCreateLabel;
public DorisWriterExcetion ( String message, Map<String, Object> response) {
super(message);
this.response = response;
}
public DorisWriterExcetion ( String message, Map<String, Object> response, boolean reCreateLabel) {
super(message);
this.response = response;
this.reCreateLabel = reCreateLabel;
}
public Map<String, Object> getFailedResponse() {
return response;
}
public boolean needReCreateLabel() {
return reCreateLabel;
}
}

View File

@ -0,0 +1,192 @@
package com.alibaba.datax.plugin.writer.doriswriter;
import com.google.common.base.Strings;
import org.apache.commons.lang3.concurrent.BasicThreadFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
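// Buffers serialized rows and flushes them asynchronously: a batch is handed to a bounded
// queue once maxBatchRows/batchSize is reached (or the flush interval fires), and a daemon
// thread drains the queue, retrying failed stream loads up to maxRetries times.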
public class DorisWriterManager {
private static final Logger LOG = LoggerFactory.getLogger(DorisWriterManager.class);
private final DorisStreamLoadObserver visitor;
private final Keys options;
private final List<byte[]> buffer = new ArrayList<> ();
private int batchCount = 0;
private long batchSize = 0;
private volatile boolean closed = false;
private volatile Exception flushException;
private final LinkedBlockingDeque< WriterTuple > flushQueue;
private ScheduledExecutorService scheduler;
private ScheduledFuture<?> scheduledFuture;
public DorisWriterManager( Keys options) {
this.options = options;
this.visitor = new DorisStreamLoadObserver (options);
flushQueue = new LinkedBlockingDeque<>(options.getFlushQueueLength());
this.startScheduler();
this.startAsyncFlushing();
}
public void startScheduler() {
stopScheduler();
this.scheduler = Executors.newScheduledThreadPool(1, new BasicThreadFactory.Builder().namingPattern("Doris-interval-flush").daemon(true).build());
this.scheduledFuture = this.scheduler.schedule(() -> {
synchronized (DorisWriterManager.this) {
if (!closed) {
try {
String label = createBatchLabel();
LOG.info(String.format("Doris interval Sinking triggered: label[%s].", label));
if (batchCount == 0) {
startScheduler();
}
flush(label, false);
} catch (Exception e) {
flushException = e;
}
}
}
}, options.getFlushInterval(), TimeUnit.MILLISECONDS);
}
public void stopScheduler() {
if (this.scheduledFuture != null) {
scheduledFuture.cancel(false);
this.scheduler.shutdown();
}
}
public final synchronized void writeRecord(String record) throws IOException {
checkFlushException();
try {
byte[] bts = record.getBytes(StandardCharsets.UTF_8);
buffer.add(bts);
batchCount++;
batchSize += bts.length;
if (batchCount >= options.getBatchRows() || batchSize >= options.getBatchSize()) {
String label = createBatchLabel();
LOG.debug(String.format("Doris buffer Sinking triggered: rows[%d] label[%s].", batchCount, label));
flush(label, false);
}
} catch (Exception e) {
throw new IOException("Writing records to Doris failed.", e);
}
}
public synchronized void flush(String label, boolean waitUtilDone) throws Exception {
checkFlushException();
if (batchCount == 0) {
if (waitUtilDone) {
waitAsyncFlushingDone();
}
return;
}
flushQueue.put(new WriterTuple (label, batchSize, new ArrayList<>(buffer)));
if (waitUtilDone) {
// wait the last flush
waitAsyncFlushingDone();
}
buffer.clear();
batchCount = 0;
batchSize = 0;
}
public synchronized void close() {
if (!closed) {
closed = true;
try {
String label = createBatchLabel();
if (batchCount > 0) LOG.debug(String.format("Doris Sink is about to close: label[%s].", label));
flush(label, true);
} catch (Exception e) {
throw new RuntimeException("Writing records to Doris failed.", e);
}
}
checkFlushException();
}
public String createBatchLabel() {
StringBuilder sb = new StringBuilder();
if (! Strings.isNullOrEmpty(options.getLabelPrefix())) {
sb.append(options.getLabelPrefix());
}
return sb.append(UUID.randomUUID().toString())
.toString();
}
private void startAsyncFlushing() {
// start flush thread
Thread flushThread = new Thread(new Runnable(){
public void run() {
while(true) {
try {
asyncFlush();
} catch (Exception e) {
flushException = e;
}
}
}
});
flushThread.setDaemon(true);
flushThread.start();
}
private void waitAsyncFlushingDone() throws InterruptedException {
// wait previous flushings
for (int i = 0; i <= options.getFlushQueueLength(); i++) {
flushQueue.put(new WriterTuple ("", 0l, null));
}
checkFlushException();
}
private void asyncFlush() throws Exception {
WriterTuple flushData = flushQueue.take();
if (Strings.isNullOrEmpty(flushData.getLabel())) {
return;
}
stopScheduler();
LOG.debug(String.format("Async stream load: rows[%d] bytes[%d] label[%s].", flushData.getRows().size(), flushData.getBytes(), flushData.getLabel()));
for (int i = 0; i <= options.getMaxRetries(); i++) {
try {
// flush to Doris with stream load
visitor.streamLoad(flushData);
LOG.info(String.format("Async stream load finished: label[%s].", flushData.getLabel()));
startScheduler();
break;
} catch (Exception e) {
LOG.warn("Failed to flush batch data to Doris, retry times = {}", i, e);
if (i >= options.getMaxRetries()) {
throw new IOException(e);
}
if (e instanceof DorisWriterExcetion && (( DorisWriterExcetion )e).needReCreateLabel()) {
String newLabel = createBatchLabel();
LOG.warn(String.format("Batch label changed from [%s] to [%s]", flushData.getLabel(), newLabel));
flushData.setLabel(newLabel);
}
try {
Thread.sleep(1000l * Math.min(i + 1, 10));
} catch (InterruptedException ex) {
Thread.currentThread().interrupt();
throw new IOException("Unable to flush, interrupted while doing another attempt", e);
}
}
}
}
private void checkFlushException() {
if (flushException != null) {
throw new RuntimeException("Writing records to Doris failed.", flushException);
}
}
}

View File

@ -0,0 +1,177 @@
package com.alibaba.datax.plugin.writer.doriswriter;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.plugin.rdbms.util.DBUtilErrorCode;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
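// Typed accessor for the writer configuration: resolves the connection[0].* paths, applies
// defaults for batch/flush settings, and validates that the required keys are present.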
public class Keys implements Serializable {
private static final long serialVersionUID = 1l;
private static final int MAX_RETRIES = 3;
private static final int BATCH_ROWS = 500000;
private static final long DEFAULT_FLUSH_INTERVAL = 30000;
private static final String LOAD_PROPS_FORMAT = "format";
public enum StreamLoadFormat {
CSV, JSON;
}
private static final String USERNAME = "username";
private static final String PASSWORD = "password";
private static final String DATABASE = "connection[0].selectedDatabase";
private static final String TABLE = "connection[0].table[0]";
private static final String COLUMN = "column";
private static final String PRE_SQL = "preSql";
private static final String POST_SQL = "postSql";
private static final String JDBC_URL = "connection[0].jdbcUrl";
private static final String LABEL_PREFIX = "labelPrefix";
private static final String MAX_BATCH_ROWS = "maxBatchRows";
private static final String MAX_BATCH_SIZE = "batchSize";
private static final String FLUSH_INTERVAL = "flushInterval";
private static final String LOAD_URL = "loadUrl";
private static final String FLUSH_QUEUE_LENGTH = "flushQueueLength";
private static final String LOAD_PROPS = "loadProps";
private static final String DEFAULT_LABEL_PREFIX = "datax_doris_writer_";
private static final long DEFAULT_MAX_BATCH_SIZE = 90 * 1024 * 1024; //default 90M
private final Configuration options;
private List<String> infoSchemaColumns;
private List<String> userSetColumns;
private boolean isWildcardColumn;
public Keys ( Configuration options) {
this.options = options;
this.userSetColumns = options.getList(COLUMN, String.class).stream().map(str -> str.replace("`", "")).collect(Collectors.toList());
if (1 == options.getList(COLUMN, String.class).size() && "*".equals(options.getList(COLUMN, String.class).get(0).trim())) {
this.isWildcardColumn = true;
}
}
public void doPretreatment() {
validateRequired();
validateStreamLoadUrl();
}
public String getJdbcUrl() {
return options.getString(JDBC_URL);
}
public String getDatabase() {
return options.getString(DATABASE);
}
public String getTable() {
return options.getString(TABLE);
}
public String getUsername() {
return options.getString(USERNAME);
}
public String getPassword() {
return options.getString(PASSWORD);
}
public String getLabelPrefix() {
String label = options.getString(LABEL_PREFIX);
return null == label ? DEFAULT_LABEL_PREFIX : label;
}
public List<String> getLoadUrlList() {
return options.getList(LOAD_URL, String.class);
}
public List<String> getColumns() {
if (isWildcardColumn) {
return this.infoSchemaColumns;
}
return this.userSetColumns;
}
public boolean isWildcardColumn() {
return this.isWildcardColumn;
}
public void setInfoCchemaColumns(List<String> cols) {
this.infoSchemaColumns = cols;
}
public List<String> getPreSqlList() {
return options.getList(PRE_SQL, String.class);
}
public List<String> getPostSqlList() {
return options.getList(POST_SQL, String.class);
}
public Map<String, Object> getLoadProps() {
return options.getMap(LOAD_PROPS);
}
public int getMaxRetries() {
return MAX_RETRIES;
}
public int getBatchRows() {
Integer rows = options.getInt(MAX_BATCH_ROWS);
return null == rows ? BATCH_ROWS : rows;
}
public long getBatchSize() {
Long size = options.getLong(MAX_BATCH_SIZE);
return null == size ? DEFAULT_MAX_BATCH_SIZE : size;
}
public long getFlushInterval() {
Long interval = options.getLong(FLUSH_INTERVAL);
return null == interval ? DEFAULT_FLUSH_INTERVAL : interval;
}
public int getFlushQueueLength() {
Integer len = options.getInt(FLUSH_QUEUE_LENGTH);
return null == len ? 1 : len;
}
public StreamLoadFormat getStreamLoadFormat() {
Map<String, Object> loadProps = getLoadProps();
if (null == loadProps) {
return StreamLoadFormat.CSV;
}
if (loadProps.containsKey(LOAD_PROPS_FORMAT)
&& StreamLoadFormat.JSON.name().equalsIgnoreCase(String.valueOf(loadProps.get(LOAD_PROPS_FORMAT)))) {
return StreamLoadFormat.JSON;
}
return StreamLoadFormat.CSV;
}
private void validateStreamLoadUrl() {
List<String> urlList = getLoadUrlList();
for (String host : urlList) {
if (host.split(":").length < 2) {
throw DataXException.asDataXException(DBUtilErrorCode.CONF_ERROR,
"The format of loadUrl is not correct, please enter:[`fe_ip:fe_http_ip;fe_ip:fe_http_ip`].");
}
}
}
private void validateRequired() {
final String[] requiredOptionKeys = new String[]{
USERNAME,
DATABASE,
TABLE,
COLUMN,
LOAD_URL
};
for (String optionKey : requiredOptionKeys) {
options.getNecessaryValue(optionKey, DBUtilErrorCode.REQUIRED_VALUE);
}
}
}

View File

@ -0,0 +1,20 @@
package com.alibaba.datax.plugin.writer.doriswriter;
import java.util.List;
public class WriterTuple {
private String label;
private Long bytes;
private List<byte[]> rows;
public WriterTuple ( String label, Long bytes, List<byte[]> rows){
this.label = label;
this.rows = rows;
this.bytes = bytes;
}
public String getLabel() { return label; }
public void setLabel(String label) { this.label = label; }
public Long getBytes() { return bytes; }
public List<byte[]> getRows() { return rows; }
}

View File

@ -0,0 +1,6 @@
{
"name": "doriswriter",
"class": "com.alibaba.datax.plugin.writer.doriswriter.DorisWriter",
"description": "apache doris writer plugin",
"developer": "apche doris"
}

View File

@ -0,0 +1,20 @@
{
"name": "doriswriter",
"parameter": {
"username": "",
"password": "",
"column": [],
"preSql": [],
"postSql": [],
"beLoadUrl": [],
"loadUrl": [],
"loadProps": {},
"connection": [
{
"jdbcUrl": "",
"selectedDatabase": "",
"table": []
}
]
}
}

View File

@ -35,12 +35,12 @@
<dependency> <dependency>
<groupId>io.searchbox</groupId> <groupId>io.searchbox</groupId>
<artifactId>jest-common</artifactId> <artifactId>jest-common</artifactId>
<version>2.4.0</version> <version>6.3.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>io.searchbox</groupId> <groupId>io.searchbox</groupId>
<artifactId>jest</artifactId> <artifactId>jest</artifactId>
<version>2.4.0</version> <version>6.3.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>joda-time</groupId> <groupId>joda-time</groupId>

View File

@ -1,236 +0,0 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter;
import com.google.gson.Gson;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import io.searchbox.action.Action;
import io.searchbox.client.JestClient;
import io.searchbox.client.JestClientFactory;
import io.searchbox.client.JestResult;
import io.searchbox.client.config.HttpClientConfig;
import io.searchbox.client.config.HttpClientConfig.Builder;
import io.searchbox.core.Bulk;
import io.searchbox.indices.CreateIndex;
import io.searchbox.indices.DeleteIndex;
import io.searchbox.indices.IndicesExists;
import io.searchbox.indices.aliases.*;
import io.searchbox.indices.mapping.PutMapping;
import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
* Created by xiongfeng.bxf on 17/2/8.
*/
public class ESClient {
private static final Logger log = LoggerFactory.getLogger(ESClient.class);
private JestClient jestClient;
public JestClient getClient() {
return jestClient;
}
public void createClient(String endpoint,
String user,
String passwd,
boolean multiThread,
int readTimeout,
boolean compression,
boolean discovery) {
JestClientFactory factory = new JestClientFactory();
Builder httpClientConfig = new HttpClientConfig
.Builder(endpoint)
.setPreemptiveAuth(new HttpHost(endpoint))
.multiThreaded(multiThread)
.connTimeout(30000)
.readTimeout(readTimeout)
.maxTotalConnection(200)
.requestCompressionEnabled(compression)
.discoveryEnabled(discovery)
.discoveryFrequency(5l, TimeUnit.MINUTES);
if (!("".equals(user) || "".equals(passwd))) {
httpClientConfig.defaultCredentials(user, passwd);
}
factory.setHttpClientConfig(httpClientConfig.build());
jestClient = factory.getObject();
}
public boolean indicesExists(String indexName) throws Exception {
boolean isIndicesExists = false;
JestResult rst = jestClient.execute(new IndicesExists.Builder(indexName).build());
if (rst.isSucceeded()) {
isIndicesExists = true;
} else {
switch (rst.getResponseCode()) {
case 404:
isIndicesExists = false;
break;
case 401:
// 401: no permission to access the index
default:
log.warn(rst.getErrorMessage());
break;
}
}
return isIndicesExists;
}
public boolean deleteIndex(String indexName) throws Exception {
log.info("delete index " + indexName);
if (indicesExists(indexName)) {
JestResult rst = execute(new DeleteIndex.Builder(indexName).build());
if (!rst.isSucceeded()) {
return false;
}
} else {
log.info("index cannot found, skip delete " + indexName);
}
return true;
}
public boolean createIndex(String indexName, String typeName,
Object mappings, String settings, boolean dynamic) throws Exception {
JestResult rst = null;
if (!indicesExists(indexName)) {
log.info("create index " + indexName);
rst = jestClient.execute(
new CreateIndex.Builder(indexName)
.settings(settings)
.setParameter("master_timeout", "5m")
.build()
);
//index_already_exists_exception
if (!rst.isSucceeded()) {
if (getStatus(rst) == 400) {
log.info(String.format("index [%s] already exists", indexName));
return true;
} else {
log.error(rst.getErrorMessage());
return false;
}
} else {
log.info(String.format("create [%s] index success", indexName));
}
}
int idx = 0;
while (idx < 5) {
if (indicesExists(indexName)) {
break;
}
Thread.sleep(2000);
idx ++;
}
if (idx >= 5) {
return false;
}
if (dynamic) {
log.info("ignore mappings");
return true;
}
log.info("create mappings for " + indexName + " " + mappings);
rst = jestClient.execute(new PutMapping.Builder(indexName, typeName, mappings)
.setParameter("master_timeout", "5m").build());
if (!rst.isSucceeded()) {
if (getStatus(rst) == 400) {
log.info(String.format("index [%s] mappings already exists", indexName));
} else {
log.error(rst.getErrorMessage());
return false;
}
} else {
log.info(String.format("index [%s] put mappings success", indexName));
}
return true;
}
public JestResult execute(Action<JestResult> clientRequest) throws Exception {
JestResult rst = null;
rst = jestClient.execute(clientRequest);
if (!rst.isSucceeded()) {
//log.warn(rst.getErrorMessage());
}
return rst;
}
public Integer getStatus(JestResult rst) {
JsonObject jsonObject = rst.getJsonObject();
if (jsonObject.has("status")) {
return jsonObject.get("status").getAsInt();
}
return 600;
}
public boolean isBulkResult(JestResult rst) {
JsonObject jsonObject = rst.getJsonObject();
return jsonObject.has("items");
}
public boolean alias(String indexname, String aliasname, boolean needClean) throws IOException {
GetAliases getAliases = new GetAliases.Builder().addIndex(aliasname).build();
AliasMapping addAliasMapping = new AddAliasMapping.Builder(indexname, aliasname).build();
JestResult rst = jestClient.execute(getAliases);
log.info(rst.getJsonString());
List<AliasMapping> list = new ArrayList<AliasMapping>();
if (rst.isSucceeded()) {
JsonParser jp = new JsonParser();
JsonObject jo = (JsonObject)jp.parse(rst.getJsonString());
for(Map.Entry<String, JsonElement> entry : jo.entrySet()){
String tindex = entry.getKey();
if (indexname.equals(tindex)) {
continue;
}
AliasMapping m = new RemoveAliasMapping.Builder(tindex, aliasname).build();
String s = new Gson().toJson(m.getData());
log.info(s);
if (needClean) {
list.add(m);
}
}
}
ModifyAliases modifyAliases = new ModifyAliases.Builder(addAliasMapping).addAlias(list).setParameter("master_timeout", "5m").build();
rst = jestClient.execute(modifyAliases);
if (!rst.isSucceeded()) {
log.error(rst.getErrorMessage());
return false;
}
return true;
}
public JestResult bulkInsert(Bulk.Builder bulk, int trySize) throws Exception {
// es_rejected_execution_exception
// illegal_argument_exception
// cluster_block_exception
JestResult rst = null;
rst = jestClient.execute(bulk.build());
if (!rst.isSucceeded()) {
log.warn(rst.getErrorMessage());
}
return rst;
}
/**
* Close the JestClient instance.
*
*/
public void closeJestClient() {
if (jestClient != null) {
jestClient.shutdownClient();
}
}
}

View File

@ -1,65 +0,0 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter;
/**
* Created by xiongfeng.bxf on 17/3/2.
*/
public class ESColumn {
private String name;//: "appkey",
private String type;//": "TEXT",
private String timezone;
private String format;
private Boolean array;
public void setName(String name) {
this.name = name;
}
public void setType(String type) {
this.type = type;
}
public void setTimeZone(String timezone) {
this.timezone = timezone;
}
public void setFormat(String format) {
this.format = format;
}
public String getName() {
return name;
}
public String getType() {
return type;
}
public String getTimezone() {
return timezone;
}
public String getFormat() {
return format;
}
public void setTimezone(String timezone) {
this.timezone = timezone;
}
public Boolean isArray() {
return array;
}
public void setArray(Boolean array) {
this.array = array;
}
public Boolean getArray() {
return array;
}
}

View File

@ -1,460 +0,0 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter;
import com.alibaba.datax.common.element.Column;
import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordReceiver;
import com.alibaba.datax.common.spi.Writer;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.common.util.RetryUtil;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.TypeReference;
import io.searchbox.client.JestResult;
import io.searchbox.core.Bulk;
import io.searchbox.core.BulkResult;
import io.searchbox.core.Index;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URLEncoder;
import java.util.*;
import java.util.concurrent.Callable;
public class ESWriter extends Writer {
private final static String WRITE_COLUMNS = "write_columns";
public static class Job extends Writer.Job {
private static final Logger log = LoggerFactory.getLogger(Job.class);
private Configuration conf = null;
@Override
public void init() {
this.conf = super.getPluginJobConf();
}
@Override
public void prepare() {
/**
* Note: this method is executed only once.
* Best practice: if the Job needs any preprocessing before data synchronization starts, do it here; otherwise this method can simply be removed.
*/
ESClient esClient = new ESClient();
esClient.createClient(Key.getEndpoint(conf),
Key.getAccessID(conf),
Key.getAccessKey(conf),
false,
300000,
false,
false);
String indexName = Key.getIndexName(conf);
String typeName = Key.getTypeName(conf);
boolean dynamic = Key.getDynamic(conf);
String mappings = genMappings(typeName);
String settings = JSONObject.toJSONString(
Key.getSettings(conf)
);
log.info(String.format("index:[%s], type:[%s], mappings:[%s]", indexName, typeName, mappings));
try {
boolean isIndicesExists = esClient.indicesExists(indexName);
if (Key.isCleanup(this.conf) && isIndicesExists) {
esClient.deleteIndex(indexName);
}
// force creation; an index that already exists is silently ignored inside createIndex
if (!esClient.createIndex(indexName, typeName, mappings, settings, dynamic)) {
throw new IOException("create index or mapping failed");
}
} catch (Exception ex) {
throw DataXException.asDataXException(ESWriterErrorCode.ES_MAPPINGS, ex.toString());
}
esClient.closeJestClient();
}
private String genMappings(String typeName) {
String mappings = null;
Map<String, Object> propMap = new HashMap<String, Object>();
List<ESColumn> columnList = new ArrayList<ESColumn>();
List column = conf.getList("column");
if (column != null) {
for (Object col : column) {
JSONObject jo = JSONObject.parseObject(col.toString());
String colName = jo.getString("name");
String colTypeStr = jo.getString("type");
if (colTypeStr == null) {
throw DataXException.asDataXException(ESWriterErrorCode.BAD_CONFIG_VALUE, col.toString() + " column must have type");
}
ESFieldType colType = ESFieldType.getESFieldType(colTypeStr);
if (colType == null) {
throw DataXException.asDataXException(ESWriterErrorCode.BAD_CONFIG_VALUE, col.toString() + " unsupported type");
}
ESColumn columnItem = new ESColumn();
if (colName.equals(Key.PRIMARY_KEY_COLUMN_NAME)) {
// kept for compatibility with earlier versions
colType = ESFieldType.ID;
colTypeStr = "id";
}
columnItem.setName(colName);
columnItem.setType(colTypeStr);
if (colType == ESFieldType.ID) {
columnList.add(columnItem);
// for the id column, no mapping properties are generated
continue;
}
Boolean array = jo.getBoolean("array");
if (array != null) {
columnItem.setArray(array);
}
Map<String, Object> field = new HashMap<String, Object>();
field.put("type", colTypeStr);
//https://www.elastic.co/guide/en/elasticsearch/reference/5.2/breaking_50_mapping_changes.html#_literal_index_literal_property
// https://www.elastic.co/guide/en/elasticsearch/guide/2.x/_deep_dive_on_doc_values.html#_disabling_doc_values
field.put("doc_values", jo.getBoolean("doc_values"));
field.put("ignore_above", jo.getInteger("ignore_above"));
field.put("index", jo.getBoolean("index"));
switch (colType) {
case STRING:
// 兼容string类型,ES5之前版本
break;
case KEYWORD:
// https://www.elastic.co/guide/en/elasticsearch/reference/current/tune-for-search-speed.html#_warm_up_global_ordinals
field.put("eager_global_ordinals", jo.getBoolean("eager_global_ordinals"));
case TEXT:
field.put("analyzer", jo.getString("analyzer"));
// 优化disk使用,也同步会提高index性能
// https://www.elastic.co/guide/en/elasticsearch/reference/current/tune-for-disk-usage.html
field.put("norms", jo.getBoolean("norms"));
field.put("index_options", jo.getBoolean("index_options"));
break;
case DATE:
columnItem.setTimeZone(jo.getString("timezone"));
columnItem.setFormat(jo.getString("format"));
// 后面时间会处理为带时区的标准时间,所以不需要给ES指定格式
/*
if (jo.getString("format") != null) {
field.put("format", jo.getString("format"));
} else {
//field.put("format", "strict_date_optional_time||epoch_millis||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd");
}
*/
break;
case GEO_SHAPE:
field.put("tree", jo.getString("tree"));
field.put("precision", jo.getString("precision"));
default:
break;
}
propMap.put(colName, field);
columnList.add(columnItem);
}
}
conf.set(WRITE_COLUMNS, JSON.toJSONString(columnList));
log.info(JSON.toJSONString(columnList));
Map<String, Object> rootMappings = new HashMap<String, Object>();
Map<String, Object> typeMappings = new HashMap<String, Object>();
typeMappings.put("properties", propMap);
rootMappings.put(typeName, typeMappings);
mappings = JSON.toJSONString(rootMappings);
if (mappings == null || "".equals(mappings)) {
throw DataXException.asDataXException(ESWriterErrorCode.BAD_CONFIG_VALUE, "must have mappings");
}
return mappings;
}
@Override
public List<Configuration> split(int mandatoryNumber) {
List<Configuration> configurations = new ArrayList<Configuration>(mandatoryNumber);
for (int i = 0; i < mandatoryNumber; i++) {
configurations.add(conf);
}
return configurations;
}
@Override
public void post() {
ESClient esClient = new ESClient();
esClient.createClient(Key.getEndpoint(conf),
Key.getAccessID(conf),
Key.getAccessKey(conf),
false,
300000,
false,
false);
String alias = Key.getAlias(conf);
if (!"".equals(alias)) {
log.info(String.format("alias [%s] to [%s]", alias, Key.getIndexName(conf)));
try {
esClient.alias(Key.getIndexName(conf), alias, Key.isNeedCleanAlias(conf));
} catch (IOException e) {
throw DataXException.asDataXException(ESWriterErrorCode.ES_ALIAS_MODIFY, e);
}
}
}
@Override
public void destroy() {
}
}
public static class Task extends Writer.Task {
private static final Logger log = LoggerFactory.getLogger(Task.class);
private Configuration conf;
ESClient esClient = null;
private List<ESFieldType> typeList;
private List<ESColumn> columnList;
private int trySize;
private int batchSize;
private String index;
private String type;
private String splitter;
@Override
public void init() {
this.conf = super.getPluginJobConf();
index = Key.getIndexName(conf);
type = Key.getTypeName(conf);
trySize = Key.getTrySize(conf);
batchSize = Key.getBatchSize(conf);
splitter = Key.getSplitter(conf);
columnList = JSON.parseObject(this.conf.getString(WRITE_COLUMNS), new TypeReference<List<ESColumn>>() {
});
typeList = new ArrayList<ESFieldType>();
for (ESColumn col : columnList) {
typeList.add(ESFieldType.getESFieldType(col.getType()));
}
esClient = new ESClient();
}
@Override
public void prepare() {
esClient.createClient(Key.getEndpoint(conf),
Key.getAccessID(conf),
Key.getAccessKey(conf),
Key.isMultiThread(conf),
Key.getTimeout(conf),
Key.isCompression(conf),
Key.isDiscovery(conf));
}
@Override
public void startWrite(RecordReceiver recordReceiver) {
List<Record> writerBuffer = new ArrayList<Record>(this.batchSize);
Record record = null;
long total = 0;
while ((record = recordReceiver.getFromReader()) != null) {
writerBuffer.add(record);
if (writerBuffer.size() >= this.batchSize) {
total += doBatchInsert(writerBuffer);
writerBuffer.clear();
}
}
if (!writerBuffer.isEmpty()) {
total += doBatchInsert(writerBuffer);
writerBuffer.clear();
}
String msg = String.format("task end, write size :%d", total);
getTaskPluginCollector().collectMessage("writesize", String.valueOf(total));
log.info(msg);
esClient.closeJestClient();
}
private String getDateStr(ESColumn esColumn, Column column) {
DateTime date = null;
DateTimeZone dtz = DateTimeZone.getDefault();
if (esColumn.getTimezone() != null) {
// 所有时区参考 http://www.joda.org/joda-time/timezones.html
dtz = DateTimeZone.forID(esColumn.getTimezone());
}
if (column.getType() != Column.Type.DATE && esColumn.getFormat() != null) {
DateTimeFormatter formatter = DateTimeFormat.forPattern(esColumn.getFormat());
date = formatter.withZone(dtz).parseDateTime(column.asString());
return date.toString();
} else if (column.getType() == Column.Type.DATE) {
date = new DateTime(column.asLong(), dtz);
return date.toString();
} else {
return column.asString();
}
}
private long doBatchInsert(final List<Record> writerBuffer) {
Map<String, Object> data = null;
final Bulk.Builder bulkaction = new Bulk.Builder().defaultIndex(this.index).defaultType(this.type);
for (Record record : writerBuffer) {
data = new HashMap<String, Object>();
String id = null;
for (int i = 0; i < record.getColumnNumber(); i++) {
Column column = record.getColumn(i);
String columnName = columnList.get(i).getName();
ESFieldType columnType = typeList.get(i);
//如果是数组类型那它传入的必是字符串类型
if (columnList.get(i).isArray() != null && columnList.get(i).isArray()) {
String[] dataList = column.asString().split(splitter);
if (!columnType.equals(ESFieldType.DATE)) {
data.put(columnName, dataList);
} else {
for (int pos = 0; pos < dataList.length; pos++) {
dataList[pos] = getDateStr(columnList.get(i), column);
}
data.put(columnName, dataList);
}
} else {
switch (columnType) {
case ID:
if (id != null) {
id += record.getColumn(i).asString();
} else {
id = record.getColumn(i).asString();
}
break;
case DATE:
try {
String dateStr = getDateStr(columnList.get(i), column);
data.put(columnName, dateStr);
} catch (Exception e) {
getTaskPluginCollector().collectDirtyRecord(record, String.format("时间类型解析失败 [%s:%s] exception: %s", columnName, column.toString(), e.toString()));
}
break;
case KEYWORD:
case STRING:
case TEXT:
case IP:
case GEO_POINT:
data.put(columnName, column.asString());
break;
case BOOLEAN:
data.put(columnName, column.asBoolean());
break;
case BYTE:
case BINARY:
data.put(columnName, column.asBytes());
break;
case LONG:
data.put(columnName, column.asLong());
break;
case INTEGER:
data.put(columnName, column.asBigInteger());
break;
case SHORT:
data.put(columnName, column.asBigInteger());
break;
case FLOAT:
case DOUBLE:
data.put(columnName, column.asDouble());
break;
case NESTED:
case OBJECT:
case GEO_SHAPE:
data.put(columnName, JSON.parse(column.asString()));
break;
default:
getTaskPluginCollector().collectDirtyRecord(record, "类型错误:不支持的类型:" + columnType + " " + columnName);
}
}
}
if (id == null) {
//id = UUID.randomUUID().toString();
bulkaction.addAction(new Index.Builder(data).build());
} else {
bulkaction.addAction(new Index.Builder(data).id(id).build());
}
}
try {
return RetryUtil.executeWithRetry(new Callable<Integer>() {
@Override
public Integer call() throws Exception {
JestResult jestResult = esClient.bulkInsert(bulkaction, 1);
if (jestResult.isSucceeded()) {
return writerBuffer.size();
}
String msg = String.format("response code: [%d] error :[%s]", jestResult.getResponseCode(), jestResult.getErrorMessage());
log.warn(msg);
if (esClient.isBulkResult(jestResult)) {
BulkResult brst = (BulkResult) jestResult;
List<BulkResult.BulkResultItem> failedItems = brst.getFailedItems();
for (BulkResult.BulkResultItem item : failedItems) {
if (item.status != 400) {
// 400 BAD_REQUEST 如果非数据异常,请求异常,则不允许忽略
throw DataXException.asDataXException(ESWriterErrorCode.ES_INDEX_INSERT, String.format("status:[%d], error: %s", item.status, item.error));
} else {
// 如果用户选择不忽略解析错误,则抛异常,默认为忽略
if (!Key.isIgnoreParseError(conf)) {
throw DataXException.asDataXException(ESWriterErrorCode.ES_INDEX_INSERT, String.format("status:[%d], error: %s, config not ignoreParseError so throw this error", item.status, item.error));
}
}
}
List<BulkResult.BulkResultItem> items = brst.getItems();
for (int idx = 0; idx < items.size(); ++idx) {
BulkResult.BulkResultItem item = items.get(idx);
if (item.error != null && !"".equals(item.error)) {
getTaskPluginCollector().collectDirtyRecord(writerBuffer.get(idx), String.format("status:[%d], error: %s", item.status, item.error));
}
}
return writerBuffer.size() - brst.getFailedItems().size();
} else {
Integer status = esClient.getStatus(jestResult);
switch (status) {
case 429: //TOO_MANY_REQUESTS
log.warn("server response too many requests, so auto reduce speed");
break;
}
throw DataXException.asDataXException(ESWriterErrorCode.ES_INDEX_INSERT, jestResult.getErrorMessage());
}
}
}, trySize, 60000L, true);
} catch (Exception e) {
if (Key.isIgnoreWriteError(this.conf)) {
log.warn(String.format("重试[%d]次写入失败,忽略该错误,继续写入!", trySize));
} else {
throw DataXException.asDataXException(ESWriterErrorCode.ES_INDEX_INSERT, e);
}
}
return 0;
}
@Override
public void post() {
}
@Override
public void destroy() {
esClient.closeJestClient();
}
}
}
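
The Task.startWrite loop above is a plain buffer-and-flush pattern: records accumulate until batchSize is reached, each full buffer goes through doBatchInsert, and whatever remains is flushed once the reader is drained. A self-contained sketch of just that pattern, with made-up record values and no DataX dependencies:

import java.util.ArrayList;
import java.util.List;

// Illustrative sketch of the buffer-and-flush loop in ESWriter.Task.startWrite.
public class BatchFlushSketch {
    public static void main(String[] args) {
        int batchSize = 3;                          // ESWriter takes this from Key.getBatchSize(conf)
        List<String> buffer = new ArrayList<>(batchSize);
        long total = 0;
        for (int i = 1; i <= 10; i++) {             // stands in for recordReceiver.getFromReader()
            buffer.add("record-" + i);
            if (buffer.size() >= batchSize) {
                total += flush(buffer);             // stands in for doBatchInsert(writerBuffer)
                buffer.clear();
            }
        }
        if (!buffer.isEmpty()) {                    // tail flush, mirroring the final doBatchInsert call
            total += flush(buffer);
            buffer.clear();
        }
        System.out.println("task end, write size :" + total);
    }

    private static long flush(List<String> batch) {
        System.out.println("flushing " + batch.size() + " records: " + batch);
        return batch.size();
    }
}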


@ -1,37 +0,0 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter;
import com.alibaba.datax.common.spi.ErrorCode;
public enum ESWriterErrorCode implements ErrorCode {
BAD_CONFIG_VALUE("ESWriter-00", "您配置的值不合法."),
ES_INDEX_DELETE("ESWriter-01", "删除index错误."),
ES_INDEX_CREATE("ESWriter-02", "创建index错误."),
ES_MAPPINGS("ESWriter-03", "mappings错误."),
ES_INDEX_INSERT("ESWriter-04", "插入数据错误."),
ES_ALIAS_MODIFY("ESWriter-05", "别名修改错误."),
;
private final String code;
private final String description;
ESWriterErrorCode(String code, String description) {
this.code = code;
this.description = description;
}
@Override
public String getCode() {
return this.code;
}
@Override
public String getDescription() {
return this.description;
}
@Override
public String toString() {
return String.format("Code:[%s], Description:[%s]. ", this.code,
this.description);
}
}


@ -0,0 +1,312 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.plugin.writer.elasticsearchwriter.jest.ClusterInfo;
import com.alibaba.datax.plugin.writer.elasticsearchwriter.jest.ClusterInfoResult;
import com.alibaba.datax.plugin.writer.elasticsearchwriter.jest.PutMapping7;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.google.gson.Gson;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import io.searchbox.action.Action;
import io.searchbox.client.JestClient;
import io.searchbox.client.JestClientFactory;
import io.searchbox.client.JestResult;
import io.searchbox.client.config.HttpClientConfig;
import io.searchbox.client.config.HttpClientConfig.Builder;
import io.searchbox.core.Bulk;
import io.searchbox.indices.CreateIndex;
import io.searchbox.indices.DeleteIndex;
import io.searchbox.indices.IndicesExists;
import io.searchbox.indices.aliases.*;
import io.searchbox.indices.mapping.GetMapping;
import io.searchbox.indices.mapping.PutMapping;
import io.searchbox.indices.settings.GetSettings;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
* Created by xiongfeng.bxf on 17/2/8.
*/
public class ElasticSearchClient {
private static final Logger LOGGER = LoggerFactory.getLogger(ElasticSearchClient.class);
private JestClient jestClient;
private Configuration conf;
public JestClient getClient() {
return jestClient;
}
public ElasticSearchClient(Configuration conf) {
this.conf = conf;
String endpoint = Key.getEndpoint(conf);
String user = Key.getUsername(conf);
String passwd = Key.getPassword(conf);
boolean multiThread = Key.isMultiThread(conf);
int readTimeout = Key.getTimeout(conf);
boolean compression = Key.isCompression(conf);
boolean discovery = Key.isDiscovery(conf);
String discoveryFilter = Key.getDiscoveryFilter(conf);
int totalConnection = this.conf.getInt("maxTotalConnection", 200);
JestClientFactory factory = new JestClientFactory();
Builder httpClientConfig = new HttpClientConfig
.Builder(endpoint)
// .setPreemptiveAuth(new HttpHost(endpoint))
.multiThreaded(multiThread)
.connTimeout(readTimeout)
.readTimeout(readTimeout)
.maxTotalConnection(totalConnection)
.requestCompressionEnabled(compression)
.discoveryEnabled(discovery)
.discoveryFrequency(5L, TimeUnit.MINUTES)
.discoveryFilter(discoveryFilter);
if (!(StringUtils.isBlank(user) || StringUtils.isBlank(passwd))) {
// 用户名与密码均已配置时启用 Basic 认证;否则保持匿名访问
httpClientConfig.defaultCredentials(user, passwd);
}
factory.setHttpClientConfig(httpClientConfig.build());
this.jestClient = factory.getObject();
}
public boolean indicesExists(String indexName) throws Exception {
boolean isIndicesExists = false;
JestResult rst = execute(new IndicesExists.Builder(indexName).build());
if (rst.isSucceeded()) {
isIndicesExists = true;
} else {
LOGGER.warn("IndicesExists got ResponseCode: {} ErrorMessage: {}", rst.getResponseCode(), rst.getErrorMessage());
switch (rst.getResponseCode()) {
case 404:
isIndicesExists = false;
break;
case 401:
// 无权访问
default:
LOGGER.warn(rst.getErrorMessage());
break;
}
}
return isIndicesExists;
}
public boolean deleteIndex(String indexName) throws Exception {
LOGGER.info("delete index {}", indexName);
if (indicesExists(indexName)) {
JestResult rst = execute(new DeleteIndex.Builder(indexName).build());
if (!rst.isSucceeded()) {
LOGGER.warn("DeleteIndex got ResponseCode: {}, ErrorMessage: {}", rst.getResponseCode(), rst.getErrorMessage());
return false;
} else {
LOGGER.info("delete index {} success", indexName);
}
} else {
LOGGER.info("index cannot found, skip delete index {}", indexName);
}
return true;
}
public boolean isGreaterOrEqualThan7() throws Exception {
try {
ClusterInfoResult result = execute(new ClusterInfo.Builder().build());
LOGGER.info("ClusterInfoResult: {}", result.getJsonString());
return result.isGreaterOrEqualThan7();
}catch(Exception e) {
LOGGER.warn(e.getMessage());
return false;
}
}
/**
* 获取索引的settings
* @param indexName 索引名
* @return 设置
*/
public String getIndexSettings(String indexName) {
GetSettings.Builder builder = new GetSettings.Builder();
builder.addIndex(indexName);
GetSettings getSettings = builder.build();
try {
LOGGER.info("begin GetSettings for index: {}", indexName);
JestResult result = this.execute(getSettings);
return result.getJsonString();
} catch (Exception e) {
String message = "GetSettings for index error: " + e.getMessage();
LOGGER.warn(message, e);
throw DataXException.asDataXException(ElasticSearchWriterErrorCode.ES_GET_SETTINGS, e.getMessage(), e);
}
}
public boolean createIndexIfNotExists(String indexName, String typeName,
Object mappings, String settings,
boolean dynamic, boolean isGreaterOrEqualThan7) throws Exception {
JestResult rst;
if (!indicesExists(indexName)) {
LOGGER.info("create index {}", indexName);
rst = execute(
new CreateIndex.Builder(indexName)
.settings(settings)
.setParameter("master_timeout", Key.getMasterTimeout(this.conf))
.build()
);
//index_already_exists_exception
if (!rst.isSucceeded()) {
LOGGER.warn("CreateIndex got ResponseCode: {}, ErrorMessage: {}", rst.getResponseCode(), rst.getErrorMessage());
if (getStatus(rst) == 400) {
LOGGER.info("index [{}] already exists", indexName);
return true;
} else {
return false;
}
} else {
LOGGER.info("create {} index success", indexName);
}
}
if (dynamic) {
LOGGER.info("dynamic is true, ignore mappings");
return true;
}
LOGGER.info("create mappings for {} {}", indexName, mappings);
//如果大于7.xmapping的PUT请求URI中不能带type并且mapping设置中不能带有嵌套结构
if (isGreaterOrEqualThan7) {
rst = execute(new PutMapping7.Builder(indexName, mappings).
setParameter("master_timeout", Key.getMasterTimeout(this.conf)).build());
} else {
rst = execute(new PutMapping.Builder(indexName, typeName, mappings)
.setParameter("master_timeout", Key.getMasterTimeout(this.conf)).build());
}
if (!rst.isSucceeded()) {
LOGGER.error("PutMapping got ResponseCode: {}, ErrorMessage: {}", rst.getResponseCode(), rst.getErrorMessage());
return false;
} else {
LOGGER.info("index {} put mappings success", indexName);
}
return true;
}
public <T extends JestResult> T execute(Action<T> clientRequest) throws IOException {
T rst = jestClient.execute(clientRequest);
if (!rst.isSucceeded()) {
LOGGER.warn(rst.getJsonString());
}
return rst;
}
public Integer getStatus(JestResult rst) {
JsonObject jsonObject = rst.getJsonObject();
if (jsonObject.has("status")) {
return jsonObject.get("status").getAsInt();
}
return 600;
}
public boolean isBulkResult(JestResult rst) {
JsonObject jsonObject = rst.getJsonObject();
return jsonObject.has("items");
}
public boolean alias(String indexname, String aliasname, boolean needClean) throws IOException {
GetAliases getAliases = new GetAliases.Builder().addIndex(aliasname).build();
AliasMapping addAliasMapping = new AddAliasMapping.Builder(indexname, aliasname).build();
JestResult rst = null;
List<AliasMapping> list = new ArrayList<AliasMapping>();
if (needClean) {
rst = execute(getAliases);
if (rst.isSucceeded()) {
JsonParser jp = new JsonParser();
JsonObject jo = (JsonObject) jp.parse(rst.getJsonString());
for (Map.Entry<String, JsonElement> entry : jo.entrySet()) {
String tindex = entry.getKey();
if (indexname.equals(tindex)) {
continue;
}
AliasMapping m = new RemoveAliasMapping.Builder(tindex, aliasname).build();
String s = new Gson().toJson(m.getData());
LOGGER.info(s);
list.add(m);
}
}
}
ModifyAliases modifyAliases = new ModifyAliases.Builder(addAliasMapping).addAlias(list).setParameter("master_timeout", Key.getMasterTimeout(this.conf)).build();
rst = execute(modifyAliases);
if (!rst.isSucceeded()) {
LOGGER.error(rst.getErrorMessage());
throw new IOException(rst.getErrorMessage());
}
return true;
}
/**
* 获取index的mapping
*/
public String getIndexMapping(String indexName) {
GetMapping.Builder builder = new GetMapping.Builder();
builder.addIndex(indexName);
GetMapping getMapping = builder.build();
try {
LOGGER.info("begin GetMapping for index: {}", indexName);
JestResult result = this.execute(getMapping);
return result.getJsonString();
} catch (Exception e) {
String message = "GetMapping for index error: " + e.getMessage();
LOGGER.warn(message, e);
throw DataXException.asDataXException(ElasticSearchWriterErrorCode.ES_MAPPINGS, e.getMessage(), e);
}
}
public String getMappingForIndexType(String indexName, String typeName) {
String indexMapping = this.getIndexMapping(indexName);
JSONObject indexMappingInJson = JSON.parseObject(indexMapping);
List<String> paths = Arrays.asList(indexName, "mappings");
JSONObject properties = JsonPathUtil.getJsonObject(paths, indexMappingInJson);
JSONObject propertiesParent = properties;
if (StringUtils.isNotBlank(typeName) && properties.containsKey(typeName)) {
propertiesParent = (JSONObject) properties.get(typeName);
}
JSONObject mapping = (JSONObject) propertiesParent.get("properties");
return JSON.toJSONString(mapping);
}
public JestResult bulkInsert(Bulk.Builder bulk) throws Exception {
// es_rejected_execution_exception
// illegal_argument_exception
// cluster_block_exception
JestResult rst = null;
rst = execute(bulk.build());
if (!rst.isSucceeded()) {
LOGGER.warn(rst.getErrorMessage());
}
return rst;
}
/**
* 关闭JestClient客户端
*
*/
public void closeJestClient() {
if (jestClient != null) {
try {
// jestClient.shutdownClient();
jestClient.close();
} catch (IOException e) {
LOGGER.warn("ignore error: ", e.getMessage());
}
}
}
}
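
For orientation, a hedged usage sketch of ElasticSearchClient as defined above. The endpoint, index name, mappings and settings strings are placeholders, the sketch is assumed to sit in the same package as the plugin classes, and a reachable Elasticsearch instance is required for it to do anything useful:

import com.alibaba.datax.common.util.Configuration;

// Hedged usage sketch, not part of the plugin: create the client, probe the server version,
// create the index if needed, then release the underlying Jest client.
public class ElasticSearchClientUsageSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = Configuration.from(
                "{\"endpoint\":\"http://127.0.0.1:9200\",\"index\":\"datax_demo\"}");
        ElasticSearchClient client = new ElasticSearchClient(conf);
        try {
            boolean es7 = client.isGreaterOrEqualThan7();       // decides PutMapping vs PutMapping7
            String mappings = "{\"properties\":{\"pk\":{\"type\":\"keyword\"}}}";   // illustrative shape
            String settings = "{\"index\":{\"number_of_shards\":1}}";               // illustrative shape
            client.createIndexIfNotExists("datax_demo", "_doc", mappings, settings, false, es7);
        } finally {
            client.closeJestClient();                           // always release the Jest client
        }
    }
}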


@ -0,0 +1,126 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter;
import java.util.List;
/**
* Created by xiongfeng.bxf on 17/3/2.
*/
public class ElasticSearchColumn {
private String name;//: "appkey",
private String type;//": "TEXT",
private String timezone;
/**
* 源头数据格式化处理datax做的事情
*/
private String format;
/**
* 目标端格式化es原生支持的格式
*/
private String dstFormat;
private boolean array;
/**
* 是否使用目标端(ES原生)数组类型
*
* 默认是false
*/
private boolean dstArray = false;
private boolean jsonArray;
private boolean origin;
private List<String> combineFields;
private String combineFieldsValueSeparator = "-";
public String getCombineFieldsValueSeparator() {
return combineFieldsValueSeparator;
}
public void setCombineFieldsValueSeparator(String combineFieldsValueSeparator) {
this.combineFieldsValueSeparator = combineFieldsValueSeparator;
}
public List<String> getCombineFields() {
return combineFields;
}
public void setCombineFields(List<String> combineFields) {
this.combineFields = combineFields;
}
public void setName(String name) {
this.name = name;
}
public void setType(String type) {
this.type = type;
}
public void setTimeZone(String timezone) {
this.timezone = timezone;
}
public void setFormat(String format) {
this.format = format;
}
public String getName() {
return name;
}
public String getType() {
return type;
}
public boolean isOrigin() { return origin; }
public void setOrigin(boolean origin) { this.origin = origin; }
public String getTimezone() {
return timezone;
}
public String getFormat() {
return format;
}
public void setTimezone(String timezone) {
this.timezone = timezone;
}
public boolean isArray() {
return array;
}
public void setArray(boolean array) {
this.array = array;
}
public boolean isJsonArray() {return jsonArray;}
public void setJsonArray(boolean jsonArray) {this.jsonArray = jsonArray;}
public String getDstFormat() {
return dstFormat;
}
public void setDstFormat(String dstFormat) {
this.dstFormat = dstFormat;
}
public boolean isDstArray() {
return dstArray;
}
public void setDstArray(boolean dstArray) {
this.dstArray = dstArray;
}
}
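
A small sketch of binding one column fragment from a job configuration to ElasticSearchColumn with fastjson; the JSON keys follow the fields above, the values are invented, and same-package visibility is assumed:

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.TypeReference;
import java.util.List;

// Hedged sketch: fastjson maps the column JSON onto the setters of ElasticSearchColumn.
public class ColumnBindingSketch {
    public static void main(String[] args) {
        String columnJson = "[{\"name\":\"create_time\",\"type\":\"date\",\"timezone\":\"+08:00\","
                + "\"format\":\"yyyy-MM-dd HH:mm:ss\",\"array\":false}]";
        List<ElasticSearchColumn> columns =
                JSON.parseObject(columnJson, new TypeReference<List<ElasticSearchColumn>>() {});
        ElasticSearchColumn col = columns.get(0);
        System.out.println(col.getName() + " -> " + col.getType()
                + ", timezone=" + col.getTimezone() + ", format=" + col.getFormat());
    }
}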


@ -3,8 +3,11 @@ package com.alibaba.datax.plugin.writer.elasticsearchwriter;
 /**
  * Created by xiongfeng.bxf on 17/3/1.
  */
-public enum ESFieldType {
+public enum ElasticSearchFieldType {
     ID,
+    PARENT,
+    ROUTING,
+    VERSION,
     STRING,
     TEXT,
     KEYWORD,
@ -24,20 +27,18 @@ public enum ESFieldType {
     DATE_RANGE,
     GEO_POINT,
     GEO_SHAPE,
     IP,
+    IP_RANGE,
     COMPLETION,
     TOKEN_COUNT,
-    ARRAY,
     OBJECT,
     NESTED;
-    public static ESFieldType getESFieldType(String type) {
+    public static ElasticSearchFieldType getESFieldType(String type) {
         if (type == null) {
             return null;
         }
-        for (ESFieldType f : ESFieldType.values()) {
+        for (ElasticSearchFieldType f : ElasticSearchFieldType.values()) {
             if (f.name().compareTo(type.toUpperCase()) == 0) {
                 return f;
             }

@ -0,0 +1,41 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter;
import com.alibaba.datax.common.spi.ErrorCode;
public enum ElasticSearchWriterErrorCode implements ErrorCode {
BAD_CONFIG_VALUE("ESWriter-00", "The value you configured is not valid."),
ES_INDEX_DELETE("ESWriter-01", "Delete index error."),
ES_INDEX_CREATE("ESWriter-02", "Index creation error."),
ES_MAPPINGS("ESWriter-03", "The mappings error."),
ES_INDEX_INSERT("ESWriter-04", "Insert data error."),
ES_ALIAS_MODIFY("ESWriter-05", "Alias modification error."),
JSON_PARSE("ESWrite-06", "Json format parsing error"),
UPDATE_WITH_ID("ESWrite-07", "Update mode must specify column type with id"),
RECORD_FIELD_NOT_FOUND("ESWrite-08", "Field does not exist in the original table"),
ES_GET_SETTINGS("ESWriter-09", "get settings failed");
;
private final String code;
private final String description;
ElasticSearchWriterErrorCode(String code, String description) {
this.code = code;
this.description = description;
}
@Override
public String getCode() {
return this.code;
}
@Override
public String getDescription() {
return this.description;
}
@Override
public String toString() {
return String.format("Code:[%s], Description:[%s]. ", this.code,
this.description);
}
}


@ -0,0 +1,28 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter;
import java.util.List;
import com.alibaba.fastjson.JSONObject;
public class JsonPathUtil {
public static JSONObject getJsonObject(List<String> paths, JSONObject data) {
if (null == paths || paths.isEmpty()) {
return data;
}
if (null == data) {
return null;
}
JSONObject dataTmp = data;
for (String each : paths) {
if (null != dataTmp) {
dataTmp = dataTmp.getJSONObject(each);
} else {
return null;
}
}
return dataTmp;
}
}
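
A short example of JsonPathUtil.getJsonObject walking a nested mapping response, the way getMappingForIndexType does; the index name and field are placeholders:

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import java.util.Arrays;

// Hedged sketch: descend into {"my_index":{"mappings":{...}}} by path, null when a segment is missing.
public class JsonPathUtilSketch {
    public static void main(String[] args) {
        JSONObject mappingResponse = JSON.parseObject(
                "{\"my_index\":{\"mappings\":{\"properties\":{\"pk\":{\"type\":\"keyword\"}}}}}");
        System.out.println(JsonPathUtil.getJsonObject(Arrays.asList("my_index", "mappings"), mappingResponse));
        // A missing segment yields null instead of throwing.
        System.out.println(JsonPathUtil.getJsonObject(Arrays.asList("my_index", "no_such_key"), mappingResponse));
    }
}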


@ -0,0 +1,54 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject;
/**
* @author bozu
* @date 2021/01/06
*/
public class JsonUtil {
/**
* 合并两个json
* @param source 源json
* @param target 目标json
* @return 合并后的json
* @throws JSONException
*/
public static String mergeJsonStr(String source, String target) throws JSONException {
if(source == null) {
return target;
}
if(target == null) {
return source;
}
return JSON.toJSONString(deepMerge(JSON.parseObject(source), JSON.parseObject(target)));
}
/**
* 深度合并两个json对象将source的值merge到target中
* @param source 源json
* @param target 目标json
* @return 合并后的json
* @throws JSONException
*/
private static JSONObject deepMerge(JSONObject source, JSONObject target) throws JSONException {
for (String key: source.keySet()) {
Object value = source.get(key);
if (target.containsKey(key)) {
// existing value for "key" - recursively deep merge:
if (value instanceof JSONObject) {
JSONObject valueJson = (JSONObject)value;
deepMerge(valueJson, target.getJSONObject(key));
} else {
target.put(key, value);
}
} else {
target.put(key, value);
}
}
return target;
}
}
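
A short example of JsonUtil.mergeJsonStr: nested objects from the source document are merged into the target instead of replacing it wholesale; the two settings fragments are illustrative:

// Hedged sketch: deep-merging two settings documents.
public class JsonUtilSketch {
    public static void main(String[] args) {
        String source = "{\"index\":{\"number_of_replicas\":0},\"analysis\":{\"analyzer\":{\"a1\":{}}}}";
        String target = "{\"index\":{\"number_of_shards\":3}}";
        // Expected, key order aside: {"index":{"number_of_shards":3,"number_of_replicas":0},"analysis":{...}}
        System.out.println(JsonUtil.mergeJsonStr(source, target));
    }
}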


@ -1,9 +1,13 @@
 package com.alibaba.datax.plugin.writer.elasticsearchwriter;
 import com.alibaba.datax.common.util.Configuration;
+import com.alibaba.fastjson.JSON;
+import com.alibaba.fastjson.TypeReference;
 import org.apache.commons.lang3.StringUtils;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 public final class Key {
@ -37,31 +41,35 @@ public final class Key {
     public static String getEndpoint(Configuration conf) {
-        return conf.getNecessaryValue("endpoint", ESWriterErrorCode.BAD_CONFIG_VALUE);
+        return conf.getNecessaryValue("endpoint", ElasticSearchWriterErrorCode.BAD_CONFIG_VALUE);
     }
-    public static String getAccessID(Configuration conf) {
-        return conf.getString("accessId", "");
+    public static String getUsername(Configuration conf) {
+        return conf.getString("username", conf.getString("accessId"));
     }
-    public static String getAccessKey(Configuration conf) {
-        return conf.getString("accessKey", "");
+    public static String getPassword(Configuration conf) {
+        return conf.getString("password", conf.getString("accessKey"));
     }
     public static int getBatchSize(Configuration conf) {
-        return conf.getInt("batchSize", 1000);
+        return conf.getInt("batchSize", 1024);
     }
     public static int getTrySize(Configuration conf) {
         return conf.getInt("trySize", 30);
     }
+    public static long getTryInterval(Configuration conf) {
+        return conf.getLong("tryInterval", 60000L);
+    }
     public static int getTimeout(Configuration conf) {
         return conf.getInt("timeout", 600000);
     }
-    public static boolean isCleanup(Configuration conf) {
-        return conf.getBool("cleanup", false);
+    public static boolean isTruncate(Configuration conf) {
+        return conf.getBool("truncate", conf.getBool("cleanup", false));
     }
     public static boolean isDiscovery(Configuration conf) {
@ -69,7 +77,7 @@ public final class Key {
     }
     public static boolean isCompression(Configuration conf) {
-        return conf.getBool("compression", true);
+        return conf.getBool("compress", conf.getBool("compression", true));
     }
     public static boolean isMultiThread(Configuration conf) {
@ -77,9 +85,17 @@ public final class Key {
     }
     public static String getIndexName(Configuration conf) {
-        return conf.getNecessaryValue("index", ESWriterErrorCode.BAD_CONFIG_VALUE);
+        return conf.getNecessaryValue("index", ElasticSearchWriterErrorCode.BAD_CONFIG_VALUE);
     }
+    public static String getDeleteBy(Configuration conf) {
+        return conf.getString("deleteBy");
+    }
+    /**
+     * TODO: 在7.0开始,一个索引只能建一个Type为_doc
+     * */
     public static String getTypeName(Configuration conf) {
         String indexType = conf.getString("indexType");
         if(StringUtils.isBlank(indexType)){
@ -128,4 +144,58 @@ public final class Key {
     public static boolean getDynamic(Configuration conf) {
         return conf.getBool("dynamic", false);
     }
+    public static String getDstDynamic(Configuration conf) {
+        return conf.getString("dstDynamic");
+    }
+    public static String getDiscoveryFilter(Configuration conf){
+        return conf.getString("discoveryFilter","_all");
+    }
+    public static Boolean getVersioning(Configuration conf) {
+        return conf.getBool("versioning", false);
+    }
+    public static Long getUnifiedVersion(Configuration conf) {
+        return conf.getLong("version", System.currentTimeMillis());
+    }
+    public static Map<String, Object> getUrlParams(Configuration conf) {
+        return conf.getMap("urlParams", new HashMap<String, Object>());
+    }
+    public static Integer getESVersion(Configuration conf) {
+        return conf.getInt("esVersion");
+    }
+    public static String getMasterTimeout(Configuration conf) {
+        return conf.getString("masterTimeout", "5m");
+    }
+    public static boolean isEnableNullUpdate(Configuration conf) {
+        return conf.getBool("enableWriteNull", true);
+    }
+    public static String getFieldDelimiter(Configuration conf) {
+        return conf.getString("fieldDelimiter", "");
+    }
+    public static PrimaryKeyInfo getPrimaryKeyInfo(Configuration conf) {
+        String primaryKeyInfoString = conf.getString("primaryKeyInfo");
+        if (StringUtils.isNotBlank(primaryKeyInfoString)) {
+            return JSON.parseObject(primaryKeyInfoString, new TypeReference<PrimaryKeyInfo>() {});
+        } else {
+            return null;
+        }
+    }
+    public static List<PartitionColumn> getEsPartitionColumn(Configuration conf) {
+        String esPartitionColumnString = conf.getString("esPartitionColumn");
+        if (StringUtils.isNotBlank(esPartitionColumnString)) {
+            return JSON.parseObject(esPartitionColumnString, new TypeReference<List<PartitionColumn>>() {});
+        } else {
+            return null;
+        }
+    }
 }
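
Several of the getters added above prefer a new key and fall back to the legacy one (username/accessId, password/accessKey, truncate/cleanup, compress/compression) so that older job files keep working. A standalone sketch of that fallback pattern, with a plain Map standing in for Configuration:

import java.util.HashMap;
import java.util.Map;

// Standalone sketch of the new-key-with-legacy-fallback lookup used in Key.
public class LegacyKeyFallbackSketch {
    public static void main(String[] args) {
        Map<String, String> conf = new HashMap<>();
        conf.put("accessId", "legacy-user");                      // only the old key is configured
        System.out.println(get(conf, "username", "accessId"));    // -> legacy-user
        conf.put("username", "new-user");                         // the new key wins once present
        System.out.println(get(conf, "username", "accessId"));    // -> new-user
    }

    private static String get(Map<String, String> conf, String newKey, String legacyKey) {
        String value = conf.get(newKey);
        return value != null ? value : conf.get(legacyKey);
    }
}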


@ -0,0 +1,16 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.spi.ErrorCode;
public class NoReRunException extends DataXException {
public NoReRunException(String errorMessage) {
super(errorMessage);
}
public NoReRunException(ErrorCode errorCode, String errorMessage) {
super(errorCode, errorMessage);
}
private static final long serialVersionUID = 1L;
}


@ -0,0 +1,42 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter;
public class PartitionColumn {
private String name;
// like: DATA
private String metaType;
private String comment;
// like: VARCHAR
private String type;
public String getName() {
return name;
}
public String getMetaType() {
return metaType;
}
public String getComment() {
return comment;
}
public String getType() {
return type;
}
public void setName(String name) {
this.name = name;
}
public void setMetaType(String metaType) {
this.metaType = metaType;
}
public void setComment(String comment) {
this.comment = comment;
}
public void setType(String type) {
this.type = type;
}
}


@ -0,0 +1,47 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter;
import java.util.List;
public class PrimaryKeyInfo {
/**
* 主键类型:PrimaryKeyTypeEnum
*
* pk: 单个(业务)主键 specific: 联合主键
*/
private String type;
/**
* 用户定义的联合主键的连接符号
*/
private String fieldDelimiter;
/**
* 主键的列的名称
*/
private List<String> column;
public String getType() {
return type;
}
public String getFieldDelimiter() {
return fieldDelimiter;
}
public List<String> getColumn() {
return column;
}
public void setType(String type) {
this.type = type;
}
public void setFieldDelimiter(String fieldDelimiter) {
this.fieldDelimiter = fieldDelimiter;
}
public void setColumn(List<String> column) {
this.column = column;
}
}
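
A hedged sketch, based only on the fields above, of how a composite _id could be assembled from the configured primary-key columns and fieldDelimiter; the joining rule and the sample values are assumptions for illustration, not the plugin's documented behaviour:

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.StringJoiner;

// Assumption: the document id is the configured columns joined by fieldDelimiter.
public class PrimaryKeyIdSketch {
    public static void main(String[] args) {
        PrimaryKeyInfo pk = new PrimaryKeyInfo();
        pk.setType("specific");                                  // composite key
        pk.setFieldDelimiter("-");
        pk.setColumn(Arrays.asList("warehouse_id", "sku_id"));

        Map<String, String> row = new LinkedHashMap<>();
        row.put("warehouse_id", "WH01");
        row.put("sku_id", "42");

        StringJoiner id = new StringJoiner(pk.getFieldDelimiter());
        for (String col : pk.getColumn()) {
            id.add(row.get(col));
        }
        System.out.println(id);                                  // WH01-42
    }
}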


@ -0,0 +1,35 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter.jest;
import com.google.gson.Gson;
import io.searchbox.action.AbstractAction;
import io.searchbox.client.config.ElasticsearchVersion;
public class ClusterInfo extends AbstractAction<ClusterInfoResult> {
@Override
protected String buildURI(ElasticsearchVersion elasticsearchVersion) {
return "";
}
@Override
public String getRestMethodName() {
return "GET";
}
@Override
public ClusterInfoResult createNewElasticSearchResult(String responseBody, int statusCode, String reasonPhrase, Gson gson) {
return createNewElasticSearchResult(new ClusterInfoResult(gson), responseBody, statusCode, reasonPhrase, gson);
}
public static class Builder extends AbstractAction.Builder<ClusterInfo, ClusterInfo.Builder> {
public Builder() {
setHeader("accept", "application/json");
setHeader("content-type", "application/json");
}
@Override
public ClusterInfo build() {
return new ClusterInfo();
}
}
}


@ -0,0 +1,49 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter.jest;
import com.google.gson.Gson;
import io.searchbox.client.JestResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ClusterInfoResult extends JestResult {
private static final Pattern FIRST_NUMBER = Pattern.compile("\\d");
private static final int SEVEN = 7;
public ClusterInfoResult(Gson gson) {
super(gson);
}
public ClusterInfoResult(JestResult source) {
super(source);
}
/**
* 判断es集群的部署版本是否大于7.x
* 大于7.x的es对于Index的type有较大改动需要做额外判定
* 对于7.x与6.x版本的es都做过测试返回符合预期;5.x以下版本直接try-catch后返回false向下兼容
* @return
*/
public Boolean isGreaterOrEqualThan7() throws Exception {
// 如果是没有权限直接返回false兼容老版本
if (responseCode == 403) {
return false;
}
if (!isSucceeded) {
throw new Exception(getJsonString());
}
try {
String version = jsonObject.getAsJsonObject("version").get("number").toString();
Matcher matcher = FIRST_NUMBER.matcher(version);
matcher.find();
String number = matcher.group();
Integer versionNum = Integer.valueOf(number);
return versionNum >= SEVEN;
} catch (Exception e) {
//5.x 以下版本不做兼容测试如果返回json格式解析失败有可能是以下版本所以认为不大于7.x
return false;
}
}
}
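
The version probe above takes the first digit of version.number and compares it with 7. A standalone rerun of that regex step (a hypothetical two-digit major such as 10.x would only match its first digit, a limit of the pattern as written):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Standalone sketch of the major-version check in ClusterInfoResult.isGreaterOrEqualThan7.
public class VersionProbeSketch {
    public static void main(String[] args) {
        Pattern firstNumber = Pattern.compile("\\d");
        for (String version : new String[]{"\"7.10.2\"", "\"6.8.23\""}) {   // toString() keeps the JSON quotes
            Matcher matcher = firstNumber.matcher(version);
            matcher.find();
            int major = Integer.parseInt(matcher.group());
            System.out.println(version + " -> major " + major + ", >=7: " + (major >= 7));
        }
    }
}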


@ -0,0 +1,39 @@
package com.alibaba.datax.plugin.writer.elasticsearchwriter.jest;
import io.searchbox.action.GenericResultAbstractAction;
import io.searchbox.client.config.ElasticsearchVersion;
public class PutMapping7 extends GenericResultAbstractAction {
protected PutMapping7(PutMapping7.Builder builder) {
super(builder);
this.indexName = builder.index;
this.payload = builder.source;
}
@Override
protected String buildURI(ElasticsearchVersion elasticsearchVersion) {
return super.buildURI(elasticsearchVersion) + "/_mapping";
}
@Override
public String getRestMethodName() {
return "PUT";
}
public static class Builder extends GenericResultAbstractAction.Builder<PutMapping7, PutMapping7.Builder> {
private String index;
private Object source;
public Builder(String index, Object source) {
this.index = index;
this.source = source;
}
@Override
public PutMapping7 build() {
return new PutMapping7(this);
}
}
}


@ -1,6 +1,6 @@
 {
     "name": "elasticsearchwriter",
-    "class": "com.alibaba.datax.plugin.writer.elasticsearchwriter.ESWriter",
+    "class": "com.alibaba.datax.plugin.writer.elasticsearchwriter.ElasticSearchWriter",
     "description": "适用于: 生产环境. 原理: TODO",
     "developer": "alibaba"
 }


@ -175,7 +175,7 @@ public class HBase20SQLReaderHelper {
         if (querySql == null || querySql.isEmpty()) {
             // 如果splitPoints为空,则根据splitKey自动切分,不过这种切分方式无法保证数据均分,且只支持整形和字符型列
             if (splitPoints == null || splitPoints.isEmpty()) {
-                LOG.info("Split accoring min and max value of splitColumn...");
+                LOG.info("Split according min and max value of splitColumn...");
                 Pair<Object, Object> minMaxPK = getPkRange(configuration);
                 if (null == minMaxPK) {
                     throw DataXException.asDataXException(HBase20xSQLReaderErrorCode.ILLEGAL_SPLIT_PK,
@ -208,7 +208,7 @@ public class HBase20SQLReaderHelper {
                 }
             } else {
-                LOG.info("Split accoring splitPoints...");
+                LOG.info("Split according splitPoints...");
                 // 根据指定splitPoints进行切分
                 rangeList = buildSplitRange();
             }


@ -2,6 +2,6 @@
"name": "hbase20xsqlreader", "name": "hbase20xsqlreader",
"class": "com.alibaba.datax.plugin.reader.hbase20xsqlreader.HBase20xSQLReader", "class": "com.alibaba.datax.plugin.reader.hbase20xsqlreader.HBase20xSQLReader",
"description": "useScene: prod. mechanism: read data from phoenix through queryserver.", "description": "useScene: prod. mechanism: read data from phoenix through queryserver.",
"developer": "bake" "developer": "alibaba"
} }


@ -2,6 +2,6 @@
"name": "hbase20xsqlwriter", "name": "hbase20xsqlwriter",
"class": "com.alibaba.datax.plugin.writer.hbase20xsqlwriter.HBase20xSQLWriter", "class": "com.alibaba.datax.plugin.writer.hbase20xsqlwriter.HBase20xSQLWriter",
"description": "useScene: prod. mechanism: use hbase sql UPSERT to put data, index tables will be updated too.", "description": "useScene: prod. mechanism: use hbase sql UPSERT to put data, index tables will be updated too.",
"developer": "bake" "developer": "alibaba"
} }


@ -231,6 +231,7 @@ HdfsWriter提供向HDFS文件系统指定路径中写入TEXTFile文件和ORCFile
 	* append,写入前不做任何处理,DataX hdfswriter直接使用filename写入,并保证文件名不冲突。
 	* nonConflict,如果目录下有fileName前缀的文件,直接报错。
+	* truncate,如果目录下有fileName前缀的文件,先删除后写入。
 	* 必选:是 <br />


@ -1,6 +1,6 @@
 {
-    "name": "hologreswriter",
-    "class": "com.alibaba.datax.plugin.writer.hologreswriter.HologresWriter",
+    "name": "hologresjdbcwriter",
+    "class": "com.alibaba.datax.plugin.writer.hologresjdbcwriter.HologresJdbcWriter",
     "description": "",
     "developer": "alibaba"
 }


@ -1,5 +1,5 @@
 {
-    "name": "hologreswriter",
+    "name": "hologresjdbcwriter",
     "parameter": {
         "url": "",
         "username": "",

loghubreader/pom.xml

@ -0,0 +1,73 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>datax-all</artifactId>
<groupId>com.alibaba.datax</groupId>
<version>0.0.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>loghubreader</artifactId>
<version>0.0.1-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-common</artifactId>
<version>${datax-project-version}</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
</dependency>
<dependency>
<groupId>com.aliyun.openservices</groupId>
<artifactId>aliyun-log</artifactId>
<version>0.6.22</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- compiler plugin -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${jdk-version}</source>
<target>${jdk-version}</target>
<encoding>${project-sourceEncoding}</encoding>
</configuration>
</plugin>
<!-- assembly plugin -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptors>
<descriptor>src/main/assembly/package.xml</descriptor>
</descriptors>
<finalName>datax</finalName>
</configuration>
<executions>
<execution>
<id>dwzip</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>


@ -0,0 +1,34 @@
<assembly
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
<id></id>
<formats>
<format>dir</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<fileSets>
<fileSet>
<directory>src/main/resources</directory>
<includes>
<include>plugin.json</include>
</includes>
<outputDirectory>plugin/reader/loghubreader</outputDirectory>
</fileSet>
<fileSet>
<directory>target/</directory>
<includes>
<include>loghubreader-0.0.1-SNAPSHOT.jar</include>
</includes>
<outputDirectory>plugin/reader/loghubreader</outputDirectory>
</fileSet>
</fileSets>
<dependencySets>
<dependencySet>
<useProjectArtifact>false</useProjectArtifact>
<outputDirectory>plugin/reader/loghubreader/libs</outputDirectory>
<scope>runtime</scope>
</dependencySet>
</dependencySets>
</assembly>


@ -0,0 +1,26 @@
package com.alibaba.datax.plugin.reader.loghubreader;
public class Constant {
public static String DATETIME_FORMAT = "yyyyMMddHHmmss";
public static String DATE_FORMAT = "yyyyMMdd";
static String META_COL_SOURCE = "__source__";
static String META_COL_TOPIC = "__topic__";
static String META_COL_CATEGORY = "__category__";
static String META_COL_MACHINEUUID = "__machineUUID__";
static String META_COL_HOSTNAME = "__hostname__";
static String META_COL_PATH = "__path__";
static String META_COL_LOGTIME = "__logtime__";
public static String META_COL_RECEIVE_TIME = "__receive_time__";
/**
* 除用户手动配置的列之外其余数据列作为一个 json 读取到一列
*/
static String COL_EXTRACT_OTHERS = "C__extract_others__";
/**
* 将所有元数据列作为一个 json 读取到一列
*/
static String COL_EXTRACT_ALL_META = "C__extract_all_meta__";
}
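
COL_EXTRACT_OTHERS collects every data column that was not explicitly configured into a single JSON value. A hedged sketch of that collection step, mirroring the loop in LogHubReader.Task.startRead further below; the column names and values are invented, and field order in the output may vary:

import com.alibaba.fastjson.JSONObject;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hedged sketch: build the C__extract_others__ payload (Constant.COL_EXTRACT_OTHERS)
// from the columns that were not explicitly configured.
public class ExtractOthersSketch {
    public static void main(String[] args) {
        List<String> configuredColumns = Arrays.asList("level", "message", "C__extract_others__");
        Map<String, String> dataMap = new HashMap<>();
        dataMap.put("level", "INFO");
        dataMap.put("message", "job started");
        dataMap.put("host_ip", "10.0.0.1");       // not configured -> goes into extract_others
        dataMap.put("thread", "main");            // not configured -> goes into extract_others

        JSONObject extractOthers = new JSONObject();
        for (Map.Entry<String, String> entry : dataMap.entrySet()) {
            if (!configuredColumns.contains(entry.getKey())) {
                extractOthers.put(entry.getKey(), entry.getValue());
            }
        }
        System.out.println(extractOthers.toJSONString());
    }
}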


@ -0,0 +1,38 @@
package com.alibaba.datax.plugin.reader.loghubreader;
public final class Key {
/**
* 此处声明插件用到的需要插件使用者提供的配置项
*/
public static final String ENDPOINT = "endpoint";
public static final String ACCESSKEYID = "accessId";
public static final String ACCESSKEYSECRET = "accessKey";
public static final String PROJECT = "project";
public static final String LOGSTORE = "logstore";
public static final String TOPIC = "topic";
public static final String COLUMN = "column";
public static final String BATCHSIZE = "batchSize";
public static final String BEGINTIMESTAMPMILLIS = "beginTimestampMillis";
public static final String ENDTIMESTAMPMILLIS = "endTimestampMillis";
public static final String BEGINDATETIME = "beginDateTime";
public static final String ENDDATETIME = "endDateTime";
public static final String TIMEFORMAT = "timeformat";
public static final String SOURCE = "source";
public static final String SHARD = "shard";
}
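
beginDateTime/endDateTime accept yyyyMMddHHmmss or yyyyMMdd and are converted to unix seconds before shard cursors are requested. A standalone sketch of that conversion, mirroring getUnixTimeFromDateTime in LogHubReader.Task (the Job-side variant throws on a bad value instead of falling back to the date-only format):

import java.text.ParseException;
import java.text.SimpleDateFormat;

// Standalone sketch: parse the full datetime format first, fall back to date-only, return unix seconds.
public class LoghubTimeParseSketch {
    public static void main(String[] args) throws ParseException {
        System.out.println(toUnixSeconds("20221026145546"));   // yyyyMMddHHmmss
        System.out.println(toUnixSeconds("20221026"));         // yyyyMMdd
    }

    private static long toUnixSeconds(String dateTime) throws ParseException {
        try {
            return new SimpleDateFormat("yyyyMMddHHmmss").parse(dateTime).getTime() / 1000;
        } catch (ParseException ignored) {
            return new SimpleDateFormat("yyyyMMdd").parse(dateTime).getTime() / 1000;
        }
    }
}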


@ -0,0 +1,482 @@
package com.alibaba.datax.plugin.reader.loghubreader;
import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.element.StringColumn;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordSender;
import com.alibaba.datax.common.spi.Reader;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.common.util.DataXCaseEnvUtil;
import com.alibaba.datax.common.util.RetryUtil;
import com.alibaba.fastjson.JSONObject;
import com.aliyun.openservices.log.Client;
import com.aliyun.openservices.log.common.Consts.CursorMode;
import com.aliyun.openservices.log.common.*;
import com.aliyun.openservices.log.exception.LogException;
import com.aliyun.openservices.log.response.BatchGetLogResponse;
import com.aliyun.openservices.log.response.GetCursorResponse;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.Callable;
public class LogHubReader extends Reader {
public static class Job extends Reader.Job {
private static final Logger LOG = LoggerFactory.getLogger(Job.class);
private Client client;
private Configuration originalConfig;
private Long beginTimestampMillis;
private Long endTimestampMillis;
@Override
public void init() {
LOG.info("loghub reader job init begin ...");
this.originalConfig = super.getPluginJobConf();
validateParameter(originalConfig);
String endPoint = this.originalConfig.getString(Key.ENDPOINT);
String accessKeyId = this.originalConfig.getString(Key.ACCESSKEYID);
String accessKeySecret = this.originalConfig.getString(Key.ACCESSKEYSECRET);
client = new Client(endPoint, accessKeyId, accessKeySecret);
LOG.info("loghub reader job init end.");
}
private void validateParameter(Configuration conf){
conf.getNecessaryValue(Key.ENDPOINT,LogHubReaderErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.ACCESSKEYID,LogHubReaderErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.ACCESSKEYSECRET,LogHubReaderErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.PROJECT,LogHubReaderErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.LOGSTORE,LogHubReaderErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.COLUMN,LogHubReaderErrorCode.REQUIRE_VALUE);
Integer batchSize = this.originalConfig.getInt(Key.BATCHSIZE);
if (batchSize != null && (batchSize <= 0 || batchSize > 1000)) {
throw DataXException.asDataXException(LogHubReaderErrorCode.BAD_CONFIG_VALUE,
"Invalid batchSize[" + batchSize + "] value, expect range (0,1000]!");
}
beginTimestampMillis = this.originalConfig.getLong(Key.BEGINTIMESTAMPMILLIS);
String beginDateTime = this.originalConfig.getString(Key.BEGINDATETIME);
if (beginDateTime != null) {
try {
beginTimestampMillis = getUnixTimeFromDateTime(beginDateTime);
} catch (ParseException e) {
throw DataXException.asDataXException(LogHubReaderErrorCode.BAD_CONFIG_VALUE,
"Invalid beginDateTime[" + beginDateTime + "], format [yyyyMMddHHmmss or yyyyMMdd]!");
}
}
if (beginTimestampMillis != null && beginTimestampMillis <= 0) {
throw DataXException.asDataXException(LogHubReaderErrorCode.BAD_CONFIG_VALUE,
"Invalid beginTimestampMillis[" + beginTimestampMillis + "]!");
}
endTimestampMillis = this.originalConfig.getLong(Key.ENDTIMESTAMPMILLIS);
String endDateTime = this.originalConfig.getString(Key.ENDDATETIME);
if (endDateTime != null) {
try {
endTimestampMillis = getUnixTimeFromDateTime(endDateTime);
} catch (ParseException e) {
throw DataXException.asDataXException(LogHubReaderErrorCode.BAD_CONFIG_VALUE,
"Invalid beginDateTime[" + endDateTime + "], format [yyyyMMddHHmmss or yyyyMMdd]!");
}
}
if (endTimestampMillis != null && endTimestampMillis <= 0) {
throw DataXException.asDataXException(LogHubReaderErrorCode.BAD_CONFIG_VALUE,
"Invalid endTimestampMillis[" + endTimestampMillis + "]!");
}
if (beginTimestampMillis != null && endTimestampMillis != null
&& endTimestampMillis <= beginTimestampMillis) {
throw DataXException.asDataXException(LogHubReaderErrorCode.BAD_CONFIG_VALUE,
"endTimestampMillis[" + endTimestampMillis + "] must bigger than beginTimestampMillis[" + beginTimestampMillis + "]!");
}
}
private long getUnixTimeFromDateTime(String dateTime) throws ParseException {
try {
String format = Constant.DATETIME_FORMAT;
SimpleDateFormat simpleDateFormat = new SimpleDateFormat(format);
return simpleDateFormat.parse(dateTime).getTime() / 1000;
} catch (ParseException ignored) {
throw DataXException.asDataXException(LogHubReaderErrorCode.BAD_CONFIG_VALUE,
"Invalid DateTime[" + dateTime + "]!");
}
}
@Override
public void prepare() {
}
@Override
public List<Configuration> split(int adviceNumber) {
LOG.info("split() begin...");
List<Configuration> readerSplitConfigs = new ArrayList<Configuration>();
final String project = this.originalConfig.getString(Key.PROJECT);
final String logstore = this.originalConfig.getString(Key.LOGSTORE);
List<Shard> logStore = null;
try {
logStore = RetryUtil.executeWithRetry(new Callable<List<Shard>>() {
@Override
public List<Shard> call() throws Exception {
return client.ListShard(project, logstore).GetShards();
}
}, DataXCaseEnvUtil.getRetryTimes(7), DataXCaseEnvUtil.getRetryInterval(1000L), DataXCaseEnvUtil.getRetryExponential(true));
} catch (Exception e) {
throw DataXException.asDataXException(LogHubReaderErrorCode.BAD_CONFIG_VALUE,
"get LogStore[" + logstore + "] error, please check ! detail error messsage: " + e.toString());
}
if (logStore == null) {
throw DataXException.asDataXException(LogHubReaderErrorCode.BAD_CONFIG_VALUE,
"LogStore[" + logstore + "] isn't exists, please check !");
}
int splitNumber = logStore.size();
if (0 == splitNumber) {
throw DataXException.asDataXException(LogHubReaderErrorCode.EMPTY_LOGSTORE_VALUE,
"LogStore[" + logstore + "] has 0 shard, please check !");
}
Collections.shuffle(logStore);
for (int i = 0; i < logStore.size(); i++) {
if (beginTimestampMillis != null && endTimestampMillis != null) {
try {
String beginCursor = getCursorWithRetry(client, project, logstore, logStore.get(i).GetShardId(), beginTimestampMillis).GetCursor();
String endCursor = getCursorWithRetry(client, project, logstore, logStore.get(i).GetShardId(), endTimestampMillis).GetCursor();
if (beginCursor.equals(endCursor)) {
if ((i == logStore.size() - 1) && (readerSplitConfigs.size() == 0)) {
} else {
LOG.info("skip empty shard[" + logStore.get(i) + "]!");
continue;
}
}
} catch (Exception e) {
LOG.error("Check Shard[" + logStore.get(i) + "] Error, please check !" + e.toString());
throw DataXException.asDataXException(LogHubReaderErrorCode.LOG_HUB_ERROR, e);
}
}
Configuration splitedConfig = this.originalConfig.clone();
splitedConfig.set(Key.SHARD, logStore.get(i).GetShardId());
readerSplitConfigs.add(splitedConfig);
}
if (splitNumber < adviceNumber) {
// LOG.info(MESSAGE_SOURCE.message("hdfsreader.12",
// splitNumber, adviceNumber, splitNumber, splitNumber));
}
LOG.info("split() ok and end...");
return readerSplitConfigs;
}
@Override
public void post() {
}
@Override
public void destroy() {
}
private GetCursorResponse getCursorWithRetry(final Client client, final String project, final String logstore, final int shard, final long fromTime) throws Exception {
return
RetryUtil.executeWithRetry(new Callable<GetCursorResponse>() {
@Override
public GetCursorResponse call() throws Exception {
LOG.info("loghug get cursor with project: {} logstore: {} shard: {} time: {}", project, logstore, shard, fromTime);
return client.GetCursor(project, logstore, shard, fromTime);
}
}, 7, 1000L, true);
}
}
public static class Task extends Reader.Task {
private static final Logger LOG = LoggerFactory.getLogger(Task.class);
private Configuration taskConfig;
private Client client;
private String endPoint;
private String accessKeyId;
private String accessKeySecret;
private String project;
private String logstore;
private long beginTimestampMillis;
private long endTimestampMillis;
private int batchSize;
private int shard;
private List<String> columns;
@Override
public void init() {
this.taskConfig = super.getPluginJobConf();
endPoint = this.taskConfig.getString(Key.ENDPOINT);
accessKeyId = this.taskConfig.getString(Key.ACCESSKEYID);
accessKeySecret = this.taskConfig.getString(Key.ACCESSKEYSECRET);
project = this.taskConfig.getString(Key.PROJECT);
logstore = this.taskConfig.getString(Key.LOGSTORE);
batchSize = this.taskConfig.getInt(Key.BATCHSIZE, 128);
this.beginTimestampMillis = this.taskConfig.getLong(Key.BEGINTIMESTAMPMILLIS, -1);
String beginDateTime = this.taskConfig.getString(Key.BEGINDATETIME);
if (beginDateTime != null) {
try {
beginTimestampMillis = getUnixTimeFromDateTime(beginDateTime);
} catch (ParseException e) {
}
}
this.endTimestampMillis = this.taskConfig.getLong(Key.ENDTIMESTAMPMILLIS, -1);
String endDateTime = this.taskConfig.getString(Key.ENDDATETIME);
if (endDateTime != null) {
try {
endTimestampMillis = getUnixTimeFromDateTime(endDateTime);
} catch (ParseException e) {
}
}
columns = this.taskConfig.getList(Key.COLUMN, String.class);
shard = this.taskConfig.getInt(Key.SHARD);
client = new Client(endPoint, accessKeyId, accessKeySecret);
LOG.info("init loghub reader task finished.project:{} logstore:{} batchSize:{}", project, logstore, batchSize);
}
@Override
public void prepare() {
}
private long getUnixTimeFromDateTime(String dateTime) throws ParseException {
try {
String format = Constant.DATETIME_FORMAT;
SimpleDateFormat simpleDateFormat = new SimpleDateFormat(format);
return simpleDateFormat.parse(dateTime).getTime() / 1000;
} catch (ParseException ignored) {
}
String format = Constant.DATE_FORMAT;
SimpleDateFormat simpleDateFormat = new SimpleDateFormat(format);
return simpleDateFormat.parse(dateTime).getTime() / 1000;
}
private GetCursorResponse getCursorWithRetry(final Client client, final String project, final String logstore, final int shard, final long fromTime) throws Exception {
return
RetryUtil.executeWithRetry(new Callable<GetCursorResponse>() {
@Override
public GetCursorResponse call() throws Exception {
LOG.info("loghug get cursor with project: {} logstore: {} shard: {} time: {}", project, logstore, shard, fromTime);
return client.GetCursor(project, logstore, shard, fromTime);
}
}, 7, 1000L, true);
}
private GetCursorResponse getCursorWithRetry(final Client client, final String project, final String logstore, final int shard, final CursorMode mode) throws Exception {
return
RetryUtil.executeWithRetry(new Callable<GetCursorResponse>() {
@Override
public GetCursorResponse call() throws Exception {
LOG.info("loghug get cursor with project: {} logstore: {} shard: {} mode: {}", project, logstore, shard, mode);
return client.GetCursor(project, logstore, shard, mode);
}
}, 7, 1000L, true);
}
private BatchGetLogResponse batchGetLogWithRetry(final Client client, final String project, final String logstore, final int shard, final int batchSize,
final String curCursor, final String endCursor) throws Exception {
return
RetryUtil.executeWithRetry(new Callable<BatchGetLogResponse>() {
@Override
public BatchGetLogResponse call() throws Exception {
return client.BatchGetLog(project, logstore, shard, batchSize, curCursor, endCursor);
}
}, 7, 1000L, true);
}
@Override
public void startRead(RecordSender recordSender) {
LOG.info("read start");
try {
GetCursorResponse cursorRes;
if (this.beginTimestampMillis != -1) {
cursorRes = getCursorWithRetry(client, project, logstore, this.shard, beginTimestampMillis);
} else {
cursorRes = getCursorWithRetry(client, project, logstore, this.shard, CursorMode.BEGIN);
}
String beginCursor = cursorRes.GetCursor();
LOG.info("the begin cursor, loghub requestId: {} cursor: {}", cursorRes.GetRequestId(), cursorRes.GetCursor());
if (this.endTimestampMillis != -1) {
cursorRes = getCursorWithRetry(client, project, logstore, this.shard, endTimestampMillis);
} else {
cursorRes = getCursorWithRetry(client, project, logstore, this.shard, CursorMode.END);
}
String endCursor = cursorRes.GetCursor();
LOG.info("the end cursor, loghub requestId: {} cursor: {}", cursorRes.GetRequestId(), cursorRes.GetCursor());
if (StringUtils.equals(beginCursor, endCursor)) {
LOG.info("beginCursor:{} equals endCursor:{}, end directly!", beginCursor, endCursor);
return;
}
String currentCursor = null;
String nextCursor = beginCursor;
HashMap<String, String> metaMap = new HashMap<String, String>();
HashMap<String, String> dataMap = new HashMap<String, String>();
JSONObject allMetaJson = new JSONObject();
while (!StringUtils.equals(currentCursor, nextCursor)) {
currentCursor = nextCursor;
BatchGetLogResponse logDataRes = batchGetLogWithRetry(client, project, logstore, this.shard, this.batchSize, currentCursor, endCursor);
List<LogGroupData> logGroups = logDataRes.GetLogGroups();
for(LogGroupData logGroup: logGroups) {
metaMap.clear();
allMetaJson.clear();
FastLogGroup flg = logGroup.GetFastLogGroup();
metaMap.put("C_Category", flg.getCategory());
metaMap.put(Constant.META_COL_CATEGORY, flg.getCategory());
allMetaJson.put(Constant.META_COL_CATEGORY, flg.getCategory());
metaMap.put("C_Source", flg.getSource());
metaMap.put(Constant.META_COL_SOURCE, flg.getSource());
allMetaJson.put(Constant.META_COL_SOURCE, flg.getSource());
metaMap.put("C_Topic", flg.getTopic());
metaMap.put(Constant.META_COL_TOPIC, flg.getTopic());
allMetaJson.put(Constant.META_COL_TOPIC, flg.getTopic());
metaMap.put("C_MachineUUID", flg.getMachineUUID());
metaMap.put(Constant.META_COL_MACHINEUUID, flg.getMachineUUID());
allMetaJson.put(Constant.META_COL_MACHINEUUID, flg.getMachineUUID());
for (int tagIdx = 0; tagIdx < flg.getLogTagsCount(); ++tagIdx) {
FastLogTag logtag = flg.getLogTags(tagIdx);
String tagKey = logtag.getKey();
String tagValue = logtag.getValue();
if (tagKey.equals(Constant.META_COL_HOSTNAME)) {
metaMap.put("C_HostName", logtag.getValue());
} else if (tagKey.equals(Constant.META_COL_PATH)) {
metaMap.put("C_Path", logtag.getValue());
}
metaMap.put(tagKey, tagValue);
allMetaJson.put(tagKey, tagValue);
}
for (int lIdx = 0; lIdx < flg.getLogsCount(); ++lIdx) {
dataMap.clear();
FastLog log = flg.getLogs(lIdx);
String logTime = String.valueOf(log.getTime());
metaMap.put("C_LogTime", logTime);
metaMap.put(Constant.META_COL_LOGTIME, logTime);
allMetaJson.put(Constant.META_COL_LOGTIME, logTime);
for (int cIdx = 0; cIdx < log.getContentsCount(); ++cIdx) {
FastLogContent content = log.getContents(cIdx);
dataMap.put(content.getKey(), content.getValue());
}
Record record = recordSender.createRecord();
JSONObject extractOthers = new JSONObject();
if(columns.contains(Constant.COL_EXTRACT_OTHERS)){
List<String> keyList = Arrays.asList(dataMap.keySet().toArray(new String[dataMap.keySet().size()]));
for (String otherKey:keyList) {
if (!columns.contains(otherKey)){
extractOthers.put(otherKey,dataMap.get(otherKey));
}
}
}
if (null != this.columns && 1 == this.columns.size()) {
String columnsInStr = columns.get(0).toString();
if ("\"*\"".equals(columnsInStr) || "*".equals(columnsInStr)) {
List<String> keyList = Arrays.asList(dataMap.keySet().toArray(new String[dataMap.keySet().size()]));
Collections.sort(keyList);
for (String key : keyList) {
record.addColumn(new StringColumn(key + ":" + dataMap.get(key)));
}
} else {
if (dataMap.containsKey(columnsInStr)) {
record.addColumn(new StringColumn(dataMap.get(columnsInStr)));
} else if (metaMap.containsKey(columnsInStr)) {
record.addColumn(new StringColumn(metaMap.get(columnsInStr)));
} else if (Constant.COL_EXTRACT_OTHERS.equals(columnsInStr)){
record.addColumn(new StringColumn(extractOthers.toJSONString()));
} else if (Constant.COL_EXTRACT_ALL_META.equals(columnsInStr)) {
record.addColumn(new StringColumn(allMetaJson.toJSONString()));
}
}
} else {
for (String col : this.columns) {
if (dataMap.containsKey(col)) {
record.addColumn(new StringColumn(dataMap.get(col)));
} else if (metaMap.containsKey(col)) {
record.addColumn(new StringColumn(metaMap.get(col)));
} else if (col != null && col.startsWith("'") && col.endsWith("'")){
String constant = col.substring(1, col.length()-1);
record.addColumn(new StringColumn(constant));
}else if (Constant.COL_EXTRACT_OTHERS.equals(col)){
record.addColumn(new StringColumn(extractOthers.toJSONString()));
} else if (Constant.COL_EXTRACT_ALL_META.equals(col)) {
record.addColumn(new StringColumn(allMetaJson.toJSONString()));
} else {
record.addColumn(new StringColumn(null));
}
}
}
recordSender.sendToWriter(record);
}
}
nextCursor = logDataRes.GetNextCursor();
}
} catch (LogException e) {
if (e.GetErrorCode().equals("LogStoreNotExist")) {
LOG.info("logStore[" + logstore +"] Not Exits! detail error messsage: " + e.toString());
} else {
LOG.error("read LogStore[" + logstore + "] error, please check ! detail error messsage: " + e.toString());
throw DataXException.asDataXException(LogHubReaderErrorCode.LOG_HUB_ERROR, e);
}
} catch (Exception e) {
LOG.error("read LogStore[" + logstore + "] error, please check ! detail error messsage: " + e.toString());
throw DataXException.asDataXException(LogHubReaderErrorCode.LOG_HUB_ERROR, e);
}
LOG.info("end read loghub shard...");
}
@Override
public void post() {
}
@Override
public void destroy() {
}
}
}

View File

@ -0,0 +1,34 @@
package com.alibaba.datax.plugin.reader.loghubreader;
import com.alibaba.datax.common.spi.ErrorCode;
public enum LogHubReaderErrorCode implements ErrorCode {
BAD_CONFIG_VALUE("LogHuReader-00", "The value you configured is invalid."),
LOG_HUB_ERROR("LogHubReader-01","LogHub access encounter exception"),
REQUIRE_VALUE("LogHubReader-02","Missing parameters"),
EMPTY_LOGSTORE_VALUE("LogHubReader-03","There is no shard in this LogStore");
private final String code;
private final String description;
private LogHubReaderErrorCode(String code, String description) {
this.code = code;
this.description = description;
}
@Override
public String getCode() {
return this.code;
}
@Override
public String getDescription() {
return this.description;
}
@Override
public String toString() {
return String.format("Code:[%s], Description:[%s]. ", this.code,
this.description);
}
}

View File

@ -0,0 +1,6 @@
{
"name": "loghubreader",
"class": "com.alibaba.datax.plugin.reader.loghubreader.LogHubReader",
"description": "适用于: 从SLS LogHub中读取数据",
"developer": "alibaba"
}

View File

@ -0,0 +1,12 @@
{
"name": "loghubreader",
"parameter": {
"endpoint": "",
"accessId": "",
"accessKey": "",
"project": "",
"logstore": "",
"batchSize":1024,
"column": []
}
}
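For orientation, the parameter template above slots into a full DataX job description. The sketch below is illustrative only and assumes placeholder values: endpoint, accessId/accessKey, project, logstore, and the user columns ("level", "message") are made up; the meta columns C_Source, C_Topic and C_LogTime come from the reader code above, and streamwriter is used purely as a throwaway sink for testing.
{
"job": {
"setting": {
"speed": { "channel": 1 }
},
"content": [
{
"reader": {
"name": "loghubreader",
"parameter": {
"endpoint": "http://cn-hangzhou.log.aliyuncs.com",
"accessId": "<yourAccessId>",
"accessKey": "<yourAccessKey>",
"project": "example_project",
"logstore": "example_logstore",
"batchSize": 1024,
"column": ["C_Source", "C_Topic", "C_LogTime", "level", "message"]
}
},
"writer": {
"name": "streamwriter",
"parameter": { "print": true }
}
}
]
}
}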

loghubwriter/pom.xml Normal file
View File

@ -0,0 +1,73 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>datax-all</artifactId>
<groupId>com.alibaba.datax</groupId>
<version>0.0.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>loghubwriter</artifactId>
<version>0.0.1-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-common</artifactId>
<version>${datax-project-version}</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
</dependency>
<dependency>
<groupId>com.aliyun.openservices</groupId>
<artifactId>aliyun-log</artifactId>
<version>0.6.12</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- compiler plugin -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${jdk-version}</source>
<target>${jdk-version}</target>
<encoding>${project-sourceEncoding}</encoding>
</configuration>
</plugin>
<!-- assembly plugin -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptors>
<descriptor>src/main/assembly/package.xml</descriptor>
</descriptors>
<finalName>datax</finalName>
</configuration>
<executions>
<execution>
<id>dwzip</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,34 @@
<assembly
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
<id></id>
<formats>
<format>dir</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<fileSets>
<fileSet>
<directory>src/main/resources</directory>
<includes>
<include>plugin.json</include>
</includes>
<outputDirectory>plugin/writer/loghubwriter</outputDirectory>
</fileSet>
<fileSet>
<directory>target/</directory>
<includes>
<include>loghubwriter-0.0.1-SNAPSHOT.jar</include>
</includes>
<outputDirectory>plugin/writer/loghubwriter</outputDirectory>
</fileSet>
</fileSets>
<dependencySets>
<dependencySet>
<useProjectArtifact>false</useProjectArtifact>
<outputDirectory>plugin/writer/loghubwriter/libs</outputDirectory>
<scope>runtime</scope>
</dependencySet>
</dependencySets>
</assembly>

View File

@ -0,0 +1,35 @@
package com.alibaba.datax.plugin.writer.loghubwriter;
/**
* Configuration keys for the LogHub writer.
*
* @author
*/
public final class Key {
/**
* Configuration entries that the plugin user must supply.
*/
public static final String ENDPOINT = "endpoint";
public static final String ACCESS_KEY_ID = "accessId";
public static final String ACCESS_KEY_SECRET = "accessKey";
public static final String PROJECT = "project";
public static final String LOG_STORE = "logstore";
public static final String TOPIC = "topic";
public static final String COLUMN = "column";
public static final String BATCH_SIZE = "batchSize";
public static final String TIME = "time";
public static final String TIME_FORMAT = "timeformat";
public static final String SOURCE = "source";
public static final String HASH_BY_KEY = "hashKey";
}
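The constants above map one-to-one onto the writer's parameter block. Below is a minimal sketch of a loghubwriter parameter section, assuming placeholder endpoint/credentials and example column names; time, timeformat, source and hashKey are optional and default to the values read in Task.init() further down.
"writer": {
"name": "loghubwriter",
"parameter": {
"endpoint": "http://cn-hangzhou.log.aliyuncs.com",
"accessId": "<yourAccessId>",
"accessKey": "<yourAccessKey>",
"project": "example_project",
"logstore": "example_logstore",
"topic": "datax",
"batchSize": 1024,
"column": ["gmt_create", "level", "message"],
"time": "gmt_create",
"timeformat": "yyyy-MM-dd HH:mm:ss",
"source": "datax",
"hashKey": false
}
}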

View File

@ -0,0 +1,315 @@
package com.alibaba.datax.plugin.writer.loghubwriter;
import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordReceiver;
import com.alibaba.datax.common.spi.Writer;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.common.util.RetryUtil;
import com.alibaba.datax.common.util.StrUtil;
import com.aliyun.openservices.log.Client;
import com.aliyun.openservices.log.common.LogItem;
import com.aliyun.openservices.log.common.Shard;
import com.aliyun.openservices.log.exception.LogException;
import com.aliyun.openservices.log.request.ListShardRequest;
import com.aliyun.openservices.log.request.PutLogsRequest;
import com.aliyun.openservices.log.response.ListShardResponse;
import com.aliyun.openservices.log.response.PutLogsResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
/**
* SLS (LogHub) writer plugin.
*
* @author
*/
public class LogHubWriter extends Writer {
public static class Job extends Writer.Job {
private static final Logger LOG = LoggerFactory.getLogger(Job.class);
private Configuration jobConfig = null;
@Override
public void init() {
info(LOG, "loghub writer job init begin ...");
this.jobConfig = super.getPluginJobConf();
validateParameter(jobConfig);
info(LOG, "loghub writer job init end.");
}
private void validateParameter(Configuration conf){
conf.getNecessaryValue(Key.ENDPOINT,LogHubWriterErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.ACCESS_KEY_ID,LogHubWriterErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.ACCESS_KEY_SECRET,LogHubWriterErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.PROJECT,LogHubWriterErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.LOG_STORE,LogHubWriterErrorCode.REQUIRE_VALUE);
conf.getNecessaryValue(Key.COLUMN,LogHubWriterErrorCode.REQUIRE_VALUE);
}
@Override
public List<Configuration> split(int mandatoryNumber) {
info(LOG, "split begin...");
List<Configuration> configurationList = new ArrayList<Configuration>();
for (int i = 0; i < mandatoryNumber; i++) {
configurationList.add(this.jobConfig.clone());
}
info(LOG, "split end...");
return configurationList;
}
@Override
public void post() {
}
@Override
public void destroy() {
}
}
public static class Task extends Writer.Task {
private static final Logger LOG = LoggerFactory.getLogger(Task.class);
private Configuration taskConfig;
private com.aliyun.openservices.log.Client logHubClient;
private String logStore;
private String topic;
private String project;
private List<String> columnList;
private int batchSize;
private String timeCol;
private String timeFormat;
private String source;
private boolean isHashKey;
private List<Shard> shards;
@Override
public void init() {
this.taskConfig = super.getPluginJobConf();
String endpoint = taskConfig.getString(Key.ENDPOINT);
String accessKeyId = taskConfig.getString(Key.ACCESS_KEY_ID);
String accessKeySecret = taskConfig.getString(Key.ACCESS_KEY_SECRET);
project = taskConfig.getString(Key.PROJECT);
logStore = taskConfig.getString(Key.LOG_STORE);
topic = taskConfig.getString(Key.TOPIC,"");
columnList = taskConfig.getList(Key.COLUMN,String.class);
batchSize = taskConfig.getInt(Key.BATCH_SIZE,1024);
timeCol = taskConfig.getString(Key.TIME,"");
timeFormat = taskConfig.getString(Key.TIME_FORMAT,"");
source = taskConfig.getString(Key.SOURCE,"");
isHashKey = taskConfig.getBool(Key.HASH_BY_KEY,false);
logHubClient = new Client(endpoint, accessKeyId, accessKeySecret);
if (isHashKey) {
listShard();
info(LOG, "init loghub writer with hash key mode.");
}
if (LOG.isInfoEnabled()) {
LOG.info("init loghub writer task finished.project:{} logstore:{} topic:{} batchSize:{}",project,logStore,topic,batchSize);
}
}
/**
* Fetch the shard list of the target logstore (needed for hashKey mode).
*/
private void listShard() {
try {
ListShardResponse response = logHubClient.ListShard(new ListShardRequest(project,logStore));
shards = response.GetShards();
if (LOG.isInfoEnabled()) {
LOG.info("Get shard count:{}", shards.size());
}
} catch (LogException e) {
info(LOG, "Get shard failed!");
throw new RuntimeException("Get shard failed!", e);
}
}
@Override
public void prepare() {
}
private int getTime(String v) {
try {
if ("bigint".equalsIgnoreCase(timeFormat)) {
return Integer.valueOf(v);
}
DateFormat sdf = new SimpleDateFormat(timeFormat);
Date date = sdf.parse(v);
return (int)(date.getTime()/1000);
} catch (Exception e) {
LOG.warn("Format time failed!", e);
}
// fall back to the current time when the value cannot be parsed
return (int) (new Date().getTime() / 1000);
}
@Override
public void startWrite(RecordReceiver recordReceiver) {
info(LOG, "start to write.....................");
// route records to shards by hash key when hashKey mode is enabled
if (isHashKey) {
processDataWithHashKey(recordReceiver);
} else {
processDataWithoutHashKey(recordReceiver);
}
info(LOG, "finish to write.........");
}
private void processDataWithHashKey(RecordReceiver receiver) {
Record record;
Map<String, List<LogItem>> logMap = new HashMap<String, List<LogItem>>(shards.size());
int count = 0;
try {
while ((record = receiver.getFromReader()) != null) {
LogItem logItem = new LogItem();
if (record.getColumnNumber() != columnList.size()) {
// hand off mismatched records as dirty data and skip them
this.getTaskPluginCollector().collectDirtyRecord(record, "column not match");
continue;
}
String id = "";
for (int i = 0; i < record.getColumnNumber(); i++) {
String colName = columnList.get(i);
String colValue = record.getColumn(i).asString();
if (colName.endsWith("_id")) {
id = colValue;
}
logItem.PushBack(colName, colValue);
if (colName.equals(timeCol)) {
logItem.SetTime(getTime(colValue));
}
}
String hashKey = getShardHashKey(StrUtil.getMd5(id), shards);
if (!logMap.containsKey(hashKey)) {
info(LOG, "Hash key:" + hashKey);
logMap.put(hashKey, new ArrayList<LogItem>());
}
logMap.get(hashKey).add(logItem);
if (logMap.get(hashKey).size() % batchSize == 0) {
PutLogsRequest request = new PutLogsRequest(project, logStore, topic, source, logMap.get(hashKey), hashKey);
PutLogsResponse response = putLog(request);
count += logMap.get(hashKey).size();
if (LOG.isDebugEnabled()) {
LOG.debug("record count:{}, request id:{}", logMap.get(hashKey).size(), response.GetRequestId());
}
logMap.get(hashKey).clear();
}
}
for (Map.Entry<String, List<LogItem>> entry : logMap.entrySet()) {
if (!entry.getValue().isEmpty()) {
// flush the remaining buffered records for this shard
PutLogsRequest request = new PutLogsRequest(project, logStore, topic, source, entry.getValue(), entry.getKey());
PutLogsResponse response = putLog(request);
count += entry.getValue().size();
if (LOG.isDebugEnabled()) {
LOG.debug("record count:{}, request id:{}", entry.getValue().size(), response.GetRequestId());
}
entry.getValue().clear();
}
}
LOG.info("{} records have been sent", count);
} catch (LogException ex) {
throw DataXException.asDataXException(LogHubWriterErrorCode.LOG_HUB_ERROR, ex.getMessage(), ex);
} catch (Exception e) {
throw DataXException.asDataXException(LogHubWriterErrorCode.LOG_HUB_ERROR, e.getMessage(), e);
}
}
private void processDataWithoutHashKey(RecordReceiver receiver) {
Record record;
ArrayList<LogItem> logGroup = new ArrayList<LogItem>();
int count = 0;
try {
while ((record = receiver.getFromReader()) != null) {
LogItem logItem = new LogItem();
if (record.getColumnNumber() != columnList.size()) {
// hand off mismatched records as dirty data and skip them
this.getTaskPluginCollector().collectDirtyRecord(record, "column not match");
continue;
}
for (int i = 0; i < record.getColumnNumber(); i++) {
String colName = columnList.get(i);
String colValue = record.getColumn(i).asString();
logItem.PushBack(colName, colValue);
if(colName.equals(timeCol)){
logItem.SetTime(getTime(colValue));
}
}
logGroup.add(logItem);
count++;
if (count % batchSize == 0) {
PutLogsRequest request = new PutLogsRequest(project, logStore, topic, source, logGroup);
PutLogsResponse response = putLog(request);
logGroup.clear();
if (LOG.isDebugEnabled()) {
LOG.debug("record count:{}, request id:{}", count, response.GetRequestId());
}
}
}
if (!logGroup.isEmpty()) {
// flush the remaining buffered records
PutLogsRequest request = new PutLogsRequest(project, logStore, topic, source, logGroup);
PutLogsResponse response = putLog(request);
logGroup.clear();
if (LOG.isDebugEnabled()) {
LOG.debug("record count:{}, request id:{}", count, response.GetRequestId());
}
}
LOG.info("{} records have been sent", count);
} catch (LogException ex) {
throw DataXException.asDataXException(LogHubWriterErrorCode.LOG_HUB_ERROR, ex.getMessage(), ex);
} catch (Exception e) {
throw DataXException.asDataXException(LogHubWriterErrorCode.LOG_HUB_ERROR, e.getMessage(), e);
}
}
private PutLogsResponse putLog(final PutLogsRequest request) throws Exception{
final Client client = this.logHubClient;
return RetryUtil.executeWithRetry(new Callable<PutLogsResponse>() {
public PutLogsResponse call() throws LogException{
return client.PutLogs(request);
}
}, 3, 1000L, false);
}
private String getShardHashKey(String hashKey, List<Shard> shards) {
for (Shard shard : shards) {
if (hashKey.compareTo(shard.getExclusiveEndKey()) < 0 && hashKey.compareTo(shard.getInclusiveBeginKey()) >= 0) {
return shard.getInclusiveBeginKey();
}
}
return shards.get(0).getInclusiveBeginKey();
}
@Override
public void post() {
}
@Override
public void destroy() {
}
}
/**
* Guarded info-level logging.
*
* @param logger target logger
* @param message message to log
*/
public static void info(Logger logger, String message) {
if (logger.isInfoEnabled()) {
logger.info(message);
}
}
}
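One note on the hashKey mode implemented above: when hashKey is true, the Task lists the logstore's shards once at init, takes the MD5 of the last column whose name ends with "_id", and routes each record to the shard whose key range covers that hash, so writes for the same id land on the same shard. A hypothetical parameter fragment enabling it (order_id is a made-up column name):
"parameter": {
"column": ["order_id", "status", "gmt_modified"],
"hashKey": true
}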

Some files were not shown because too many files have changed in this diff.