Merge pull request #1 from alibaba/master

123
This commit is contained in:
gfbfabcd123 2023-11-05 02:40:29 +08:00 committed by GitHub
commit ec7bfc0aa4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
291 changed files with 16226 additions and 3010 deletions

107
README.md
View File

@ -26,7 +26,7 @@ DataX本身作为数据同步框架将不同数据源的同步抽象为从源
# Quick Start
##### Download [DataX下载地址](https://datax-opensource.oss-cn-hangzhou.aliyuncs.com/202303/datax.tar.gz)
##### Download [DataX下载地址](https://datax-opensource.oss-cn-hangzhou.aliyuncs.com/202308/datax.tar.gz)
##### 请点击:[Quick Start](https://github.com/alibaba/DataX/blob/master/userGuid.md)
@ -37,47 +37,48 @@ DataX本身作为数据同步框架将不同数据源的同步抽象为从源
DataX目前已经有了比较全面的插件体系主流的RDBMS数据库、NOSQL、大数据计算系统都已经接入目前支持数据如下图详情请点击[DataX数据源参考指南](https://github.com/alibaba/DataX/wiki/DataX-all-data-channels)
| 类型 | 数据源 | Reader(读) | Writer(写) | 文档 |
| 类型 | 数据源 | Reader(读) | Writer(写) | 文档 |
|--------------|---------------------------|:---------:|:---------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
| RDBMS 关系型数据库 | MySQL | √ | √ | [](https://github.com/alibaba/DataX/blob/master/mysqlreader/doc/mysqlreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/mysqlwriter/doc/mysqlwriter.md) |
| | Oracle | √ | √ | [](https://github.com/alibaba/DataX/blob/master/oraclereader/doc/oraclereader.md) 、[写](https://github.com/alibaba/DataX/blob/master/oraclewriter/doc/oraclewriter.md) |
| | OceanBase | √ | √ | [](https://open.oceanbase.com/docs/community/oceanbase-database/V3.1.0/use-datax-to-full-migration-data-to-oceanbase) 、[写](https://open.oceanbase.com/docs/community/oceanbase-database/V3.1.0/use-datax-to-full-migration-data-to-oceanbase) |
| | SQLServer | √ | √ | [](https://github.com/alibaba/DataX/blob/master/sqlserverreader/doc/sqlserverreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/sqlserverwriter/doc/sqlserverwriter.md) |
| | PostgreSQL | √ | √ | [](https://github.com/alibaba/DataX/blob/master/postgresqlreader/doc/postgresqlreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/postgresqlwriter/doc/postgresqlwriter.md) |
| | DRDS | √ | √ | [](https://github.com/alibaba/DataX/blob/master/drdsreader/doc/drdsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/drdswriter/doc/drdswriter.md) |
| | Kingbase | √ | √ | [](https://github.com/alibaba/DataX/blob/master/drdsreader/doc/drdsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/drdswriter/doc/drdswriter.md) |
| | 通用RDBMS(支持所有关系型数据库) | √ | √ | [](https://github.com/alibaba/DataX/blob/master/rdbmsreader/doc/rdbmsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/rdbmswriter/doc/rdbmswriter.md) |
| 阿里云数仓数据存储 | ODPS | √ | √ | [](https://github.com/alibaba/DataX/blob/master/odpsreader/doc/odpsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/odpswriter/doc/odpswriter.md) |
| | ADB | | √ | [](https://github.com/alibaba/DataX/blob/master/adbmysqlwriter/doc/adbmysqlwriter.md) |
| | ADS | | √ | [](https://github.com/alibaba/DataX/blob/master/adswriter/doc/adswriter.md) |
| | OSS | √ | √ | [](https://github.com/alibaba/DataX/blob/master/ossreader/doc/ossreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/osswriter/doc/osswriter.md) |
| | OCS | | √ | [](https://github.com/alibaba/DataX/blob/master/ocswriter/doc/ocswriter.md) |
| | Hologres | | √ | [](https://github.com/alibaba/DataX/blob/master/hologresjdbcwriter/doc/hologresjdbcwriter.md) |
| | AnalyticDB For PostgreSQL | | √ | 写 |
| 阿里云中间件 | datahub | √ | √ | 读 、写 |
| | SLS | √ | √ | 读 、写 |
| 阿里云图数据库 | GDB | √ | √ | [](https://github.com/alibaba/DataX/blob/master/gdbreader/doc/gdbreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/gdbwriter/doc/gdbwriter.md) |
| NoSQL数据存储 | OTS | √ | √ | [](https://github.com/alibaba/DataX/blob/master/otsreader/doc/otsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/otswriter/doc/otswriter.md) |
| | Hbase0.94 | √ | √ | [](https://github.com/alibaba/DataX/blob/master/hbase094xreader/doc/hbase094xreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/hbase094xwriter/doc/hbase094xwriter.md) |
| | Hbase1.1 | √ | √ | [](https://github.com/alibaba/DataX/blob/master/hbase11xreader/doc/hbase11xreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/hbase11xwriter/doc/hbase11xwriter.md) |
| | Phoenix4.x | √ | √ | [](https://github.com/alibaba/DataX/blob/master/hbase11xsqlreader/doc/hbase11xsqlreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/hbase11xsqlwriter/doc/hbase11xsqlwriter.md) |
| | Phoenix5.x | √ | √ | [](https://github.com/alibaba/DataX/blob/master/hbase20xsqlreader/doc/hbase20xsqlreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/hbase20xsqlwriter/doc/hbase20xsqlwriter.md) |
| | MongoDB | √ | √ | [](https://github.com/alibaba/DataX/blob/master/mongodbreader/doc/mongodbreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/mongodbwriter/doc/mongodbwriter.md) |
| | Cassandra | √ | √ | [](https://github.com/alibaba/DataX/blob/master/cassandrareader/doc/cassandrareader.md) 、[写](https://github.com/alibaba/DataX/blob/master/cassandrawriter/doc/cassandrawriter.md) |
| 数仓数据存储 | StarRocks | √ | √ | 读 、[写](https://github.com/alibaba/DataX/blob/master/starrockswriter/doc/starrockswriter.md) |
| | ApacheDoris | | √ | [](https://github.com/alibaba/DataX/blob/master/doriswriter/doc/doriswriter.md) |
| | ClickHouse | | √ | 写 |
| | Databend | | √ | [](https://github.com/alibaba/DataX/blob/master/databendwriter/doc/databendwriter.md) |
| | Hive | √ | √ | [](https://github.com/alibaba/DataX/blob/master/hdfsreader/doc/hdfsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/hdfswriter/doc/hdfswriter.md) |
| | kudu | | √ | [](https://github.com/alibaba/DataX/blob/master/hdfswriter/doc/hdfswriter.md) |
| | selectdb | | √ | [](https://github.com/alibaba/DataX/blob/master/selectdbwriter/doc/selectdbwriter.md) |
| 无结构化数据存储 | TxtFile | √ | √ | [](https://github.com/alibaba/DataX/blob/master/txtfilereader/doc/txtfilereader.md) 、[写](https://github.com/alibaba/DataX/blob/master/txtfilewriter/doc/txtfilewriter.md) |
| | FTP | √ | √ | [](https://github.com/alibaba/DataX/blob/master/ftpreader/doc/ftpreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/ftpwriter/doc/ftpwriter.md) |
| | HDFS | √ | √ | [](https://github.com/alibaba/DataX/blob/master/hdfsreader/doc/hdfsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/hdfswriter/doc/hdfswriter.md) |
| | Elasticsearch | | √ | [](https://github.com/alibaba/DataX/blob/master/elasticsearchwriter/doc/elasticsearchwriter.md) |
| 时间序列数据库 | OpenTSDB | √ | | [](https://github.com/alibaba/DataX/blob/master/opentsdbreader/doc/opentsdbreader.md) |
| | TSDB | √ | √ | [](https://github.com/alibaba/DataX/blob/master/tsdbreader/doc/tsdbreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/tsdbwriter/doc/tsdbhttpwriter.md) |
| | TDengine | √ | √ | [](https://github.com/alibaba/DataX/blob/master/tdenginereader/doc/tdenginereader-CN.md) 、[写](https://github.com/alibaba/DataX/blob/master/tdenginewriter/doc/tdenginewriter-CN.md) |
| RDBMS 关系型数据库 | MySQL | √ | √ | [](https://github.com/alibaba/DataX/blob/master/mysqlreader/doc/mysqlreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/mysqlwriter/doc/mysqlwriter.md) |
| | Oracle | √ | √ | [](https://github.com/alibaba/DataX/blob/master/oraclereader/doc/oraclereader.md) 、[写](https://github.com/alibaba/DataX/blob/master/oraclewriter/doc/oraclewriter.md) |
| | OceanBase | √ | √ | [](https://open.oceanbase.com/docs/community/oceanbase-database/V3.1.0/use-datax-to-full-migration-data-to-oceanbase) 、[写](https://open.oceanbase.com/docs/community/oceanbase-database/V3.1.0/use-datax-to-full-migration-data-to-oceanbase) |
| | SQLServer | √ | √ | [](https://github.com/alibaba/DataX/blob/master/sqlserverreader/doc/sqlserverreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/sqlserverwriter/doc/sqlserverwriter.md) |
| | PostgreSQL | √ | √ | [](https://github.com/alibaba/DataX/blob/master/postgresqlreader/doc/postgresqlreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/postgresqlwriter/doc/postgresqlwriter.md) |
| | DRDS | √ | √ | [](https://github.com/alibaba/DataX/blob/master/drdsreader/doc/drdsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/drdswriter/doc/drdswriter.md) |
| | Kingbase | √ | √ | [](https://github.com/alibaba/DataX/blob/master/drdsreader/doc/drdsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/drdswriter/doc/drdswriter.md) |
| | 通用RDBMS(支持所有关系型数据库) | √ | √ | [](https://github.com/alibaba/DataX/blob/master/rdbmsreader/doc/rdbmsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/rdbmswriter/doc/rdbmswriter.md) |
| 阿里云数仓数据存储 | ODPS | √ | √ | [](https://github.com/alibaba/DataX/blob/master/odpsreader/doc/odpsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/odpswriter/doc/odpswriter.md) |
| | ADB | | √ | [](https://github.com/alibaba/DataX/blob/master/adbmysqlwriter/doc/adbmysqlwriter.md) |
| | ADS | | √ | [](https://github.com/alibaba/DataX/blob/master/adswriter/doc/adswriter.md) |
| | OSS | √ | √ | [](https://github.com/alibaba/DataX/blob/master/ossreader/doc/ossreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/osswriter/doc/osswriter.md) |
| | OCS | | √ | [](https://github.com/alibaba/DataX/blob/master/ocswriter/doc/ocswriter.md) |
| | Hologres | | √ | [](https://github.com/alibaba/DataX/blob/master/hologresjdbcwriter/doc/hologresjdbcwriter.md) |
| | AnalyticDB For PostgreSQL | | √ | 写 |
| 阿里云中间件 | datahub | √ | √ | 读 、写 |
| | SLS | √ | √ | 读 、写 |
| 图数据库 | 阿里云 GDB | √ | √ | [](https://github.com/alibaba/DataX/blob/master/gdbreader/doc/gdbreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/gdbwriter/doc/gdbwriter.md) |
| | Neo4j | | √ | [](https://github.com/alibaba/DataX/blob/master/neo4jwriter/doc/neo4jwriter.md) |
| NoSQL数据存储 | OTS | √ | √ | [](https://github.com/alibaba/DataX/blob/master/otsreader/doc/otsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/otswriter/doc/otswriter.md) |
| | Hbase0.94 | √ | √ | [](https://github.com/alibaba/DataX/blob/master/hbase094xreader/doc/hbase094xreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/hbase094xwriter/doc/hbase094xwriter.md) |
| | Hbase1.1 | √ | √ | [](https://github.com/alibaba/DataX/blob/master/hbase11xreader/doc/hbase11xreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/hbase11xwriter/doc/hbase11xwriter.md) |
| | Phoenix4.x | √ | √ | [](https://github.com/alibaba/DataX/blob/master/hbase11xsqlreader/doc/hbase11xsqlreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/hbase11xsqlwriter/doc/hbase11xsqlwriter.md) |
| | Phoenix5.x | √ | √ | [](https://github.com/alibaba/DataX/blob/master/hbase20xsqlreader/doc/hbase20xsqlreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/hbase20xsqlwriter/doc/hbase20xsqlwriter.md) |
| | MongoDB | √ | √ | [](https://github.com/alibaba/DataX/blob/master/mongodbreader/doc/mongodbreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/mongodbwriter/doc/mongodbwriter.md) |
| | Cassandra | √ | √ | [](https://github.com/alibaba/DataX/blob/master/cassandrareader/doc/cassandrareader.md) 、[写](https://github.com/alibaba/DataX/blob/master/cassandrawriter/doc/cassandrawriter.md) |
| 数仓数据存储 | StarRocks | √ | √ | 读 、[写](https://github.com/alibaba/DataX/blob/master/starrockswriter/doc/starrockswriter.md) |
| | ApacheDoris | | √ | [](https://github.com/alibaba/DataX/blob/master/doriswriter/doc/doriswriter.md) |
| | ClickHouse | √ | √ | [](https://github.com/alibaba/DataX/blob/master/clickhousereader/doc/clickhousereader.md) 、[写](https://github.com/alibaba/DataX/blob/master/clickhousewriter/doc/clickhousewriter.md) |
| | Databend | | √ | [](https://github.com/alibaba/DataX/blob/master/databendwriter/doc/databendwriter.md) |
| | Hive | √ | √ | [](https://github.com/alibaba/DataX/blob/master/hdfsreader/doc/hdfsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/hdfswriter/doc/hdfswriter.md) |
| | kudu | | √ | [](https://github.com/alibaba/DataX/blob/master/hdfswriter/doc/hdfswriter.md) |
| | selectdb | | √ | [](https://github.com/alibaba/DataX/blob/master/selectdbwriter/doc/selectdbwriter.md) |
| 无结构化数据存储 | TxtFile | √ | √ | [](https://github.com/alibaba/DataX/blob/master/txtfilereader/doc/txtfilereader.md) 、[写](https://github.com/alibaba/DataX/blob/master/txtfilewriter/doc/txtfilewriter.md) |
| | FTP | √ | √ | [](https://github.com/alibaba/DataX/blob/master/ftpreader/doc/ftpreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/ftpwriter/doc/ftpwriter.md) |
| | HDFS | √ | √ | [](https://github.com/alibaba/DataX/blob/master/hdfsreader/doc/hdfsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/hdfswriter/doc/hdfswriter.md) |
| | Elasticsearch | | √ | [](https://github.com/alibaba/DataX/blob/master/elasticsearchwriter/doc/elasticsearchwriter.md) |
| 时间序列数据库 | OpenTSDB | √ | | [](https://github.com/alibaba/DataX/blob/master/opentsdbreader/doc/opentsdbreader.md) |
| | TSDB | √ | √ | [](https://github.com/alibaba/DataX/blob/master/tsdbreader/doc/tsdbreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/tsdbwriter/doc/tsdbhttpwriter.md) |
| | TDengine | √ | √ | [](https://github.com/alibaba/DataX/blob/master/tdenginereader/doc/tdenginereader-CN.md) 、[写](https://github.com/alibaba/DataX/blob/master/tdenginewriter/doc/tdenginewriter-CN.md) |
# 阿里云DataWorks数据集成
@ -99,7 +100,7 @@ DataX目前已经有了比较全面的插件体系主流的RDBMS数据库、N
- 整库迁移https://help.aliyun.com/document_detail/137809.html
- 批量上云https://help.aliyun.com/document_detail/146671.html
- 更新更多能力请访问https://help.aliyun.com/document_detail/137663.html
-
# 我要开发新的插件
@ -109,6 +110,28 @@ DataX目前已经有了比较全面的插件体系主流的RDBMS数据库、N
DataX 后续计划月度迭代更新,也欢迎感兴趣的同学提交 Pull requests月度更新内容会介绍介绍如下。
- [datax_v202309]https://github.com/alibaba/DataX/releases/tag/datax_v202309)
- 支持Phoenix 同步数据添加 where条件
- 支持华为 GuassDB读写插件
- 修复ClickReader 插件运行报错 Can't find bundle for base name
- 增加 DataX调试模块
- 修复 orc空文件报错问题
- 优化obwriter性能
- txtfilewriter 增加导出为insert语句功能支持
- HdfsReader/HdfsWriter 支持parquet读写能力
- [datax_v202308]https://github.com/alibaba/DataX/releases/tag/datax_v202308)
- OTS 插件更新
- databend 插件更新
- Oceanbase驱动修复
- [datax_v202306]https://github.com/alibaba/DataX/releases/tag/datax_v202306)
- 精简代码
- 新增插件neo4jwriter、clickhousewriter
- 优化插件、修复问题oceanbase、hdfs、databend、txtfile
- [datax_v202303]https://github.com/alibaba/DataX/releases/tag/datax_v202303)
- 精简代码
- 新增插件adbmysqlwriter、databendwriter、selectdbwriter

View File

@ -0,0 +1,344 @@
# ClickhouseReader 插件文档
___
## 1 快速介绍
ClickhouseReader插件实现了从Clickhouse读取数据。在底层实现上ClickhouseReader通过JDBC连接远程Clickhouse数据库并执行相应的sql语句将数据从Clickhouse库中SELECT出来。
## 2 实现原理
简而言之ClickhouseReader通过JDBC连接器连接到远程的Clickhouse数据库并根据用户配置的信息生成查询SELECT SQL语句并发送到远程Clickhouse数据库并将该SQL执行返回结果使用DataX自定义的数据类型拼装为抽象的数据集并传递给下游Writer处理。
对于用户配置Table、Column、Where的信息ClickhouseReader将其拼接为SQL语句发送到Clickhouse数据库对于用户配置querySql信息Clickhouse直接将其发送到Clickhouse数据库。
## 3 功能说明
### 3.1 配置样例
* 配置一个从Clickhouse数据库同步抽取数据到本地的作业:
```
{
"job": {
"setting": {
"speed": {
//设置传输速度 byte/s 尽量逼近这个速度但是不高于它.
// channel 表示通道数量byte表示通道速度如果单通道速度1MB配置byte为1048576表示一个channel
"byte": 1048576
},
//出错限制
"errorLimit": {
//先选择record
"record": 0,
//百分比 1表示100%
"percentage": 0.02
}
},
"content": [
{
"reader": {
"name": "clickhousereader",
"parameter": {
// 数据库连接用户名
"username": "root",
// 数据库连接密码
"password": "root",
"column": [
"id","name"
],
"connection": [
{
"table": [
"table"
],
"jdbcUrl": [
"jdbc:clickhouse://[HOST_NAME]:PORT/[DATABASE_NAME]"
]
}
]
}
},
"writer": {
//writer类型
"name": "streamwriter",
// 是否打印内容
"parameter": {
"print": true
}
}
}
]
}
}
```
* 配置一个自定义SQL的数据库同步任务到本地内容的作业
```
{
"job": {
"setting": {
"speed": {
"channel": 5
}
},
"content": [
{
"reader": {
"name": "clickhousereader",
"parameter": {
"username": "root",
"password": "root",
"where": "",
"connection": [
{
"querySql": [
"select db_id,on_line_flag from db_info where db_id < 10"
],
"jdbcUrl": [
"jdbc:clickhouse://1.1.1.1:8123/default"
]
}
]
}
},
"writer": {
"name": "streamwriter",
"parameter": {
"visible": false,
"encoding": "UTF-8"
}
}
}
]
}
}
```
### 3.2 参数说明
* **jdbcUrl**
* 描述描述的是到对端数据库的JDBC连接信息使用JSON的数组描述并支持一个库填写多个连接地址。之所以使用JSON数组描述连接信息是因为阿里集团内部支持多个IP探测如果配置了多个ClickhouseReader可以依次探测ip的可连接性直到选择一个合法的IP。如果全部连接失败ClickhouseReader报错。 注意jdbcUrl必须包含在connection配置单元中。对于阿里集团外部使用情况JSON数组填写一个JDBC连接即可。
jdbcUrl按照Clickhouse官方规范并可以填写连接附件控制信息。具体请参看[Clickhouse官方文档](https://clickhouse.com/docs/en/engines/table-engines/integrations/jdbc)。
* 必选:是 <br />
* 默认值:无 <br />
* **username**
* 描述:数据源的用户名 <br />
* 必选:是 <br />
* 默认值:无 <br />
* **password**
* 描述:数据源指定用户名的密码 <br />
* 必选:是 <br />
* 默认值:无 <br />
* **table**
* 描述所选取的需要同步的表。使用JSON的数组描述因此支持多张表同时抽取。当配置为多张表时用户自己需保证多张表是同一schema结构ClickhouseReader不予检查表是否同一逻辑表。注意table必须包含在connection配置单元中。<br />
* 必选:是 <br />
* 默认值:无 <br />
* **column**
* 描述所配置的表中需要同步的列名集合使用JSON的数组描述字段信息。用户使用\*代表默认使用所有列配置,例如['\*']。
支持列裁剪,即列可以挑选部分列进行导出。
支持列换序即列可以不按照表schema信息进行导出。
支持常量配置用户需要按照JSON格式:
["id", "`table`", "1", "'bazhen.csy'", "null", "to_char(a + 1)", "2.3" , "true"]
id为普通列名\`table\`为包含保留在的列名1为整形数字常量'bazhen.csy'为字符串常量null为空指针to_char(a + 1)为表达式2.3为浮点数true为布尔值。
Column必须显示填写不允许为空
* 必选:是 <br />
* 默认值:无 <br />
* **splitPk**
* 描述ClickhouseReader进行数据抽取时如果指定splitPk表示用户希望使用splitPk代表的字段进行数据分片DataX因此会启动并发任务进行数据同步这样可以大大提供数据同步的效能。
推荐splitPk用户使用表主键因为表主键通常情况下比较均匀因此切分出来的分片也不容易出现数据热点。
目前splitPk仅支持整形数据切分`不支持浮点、日期等其他类型`。如果用户指定其他非支持类型ClickhouseReader将报错
splitPk如果不填写将视作用户不对单表进行切分ClickhouseReader使用单通道同步全量数据。
* 必选:否 <br />
* 默认值:无 <br />
* **where**
* 描述筛选条件MysqlReader根据指定的column、table、where条件拼接SQL并根据这个SQL进行数据抽取。在实际业务场景中往往会选择当天的数据进行同步可以将where条件指定为gmt_create > $bizdate 。注意不可以将where条件指定为limit 10limit不是SQL的合法where子句。<br />
where条件可以有效地进行业务增量同步。
* 必选:否 <br />
* 默认值:无 <br />
* **querySql**
* 描述在有些业务场景下where这一配置项不足以描述所筛选的条件用户可以通过该配置型来自定义筛选SQL。当用户配置了这一项之后DataX系统就会忽略tablecolumn这些配置型直接使用这个配置项的内容对数据进行筛选例如需要进行多表join后同步数据使用select a,b from table_a join table_b on table_a.id = table_b.id <br />
`当用户配置querySql时ClickhouseReader直接忽略table、column、where条件的配置`
* 必选:否 <br />
* 默认值:无 <br />
* **fetchSize**
* 描述该配置项定义了插件和数据库服务器端每次批量数据获取条数该值决定了DataX和服务器端的网络交互次数能够较大的提升数据抽取性能。<br />
`注意,该值过大(>2048)可能造成DataX进程OOM。`
* 必选:否 <br />
* 默认值1024 <br />
* **session**
* 描述:控制写入数据的时间格式,时区等的配置,如果表中有时间字段,配置该值以明确告知写入 clickhouse 的时间格式。通常配置的参数为NLS_DATE_FORMAT,NLS_TIME_FORMAT。其配置的值为 json 格式,例如:
```
"session": [
"alter session set NLS_DATE_FORMAT='yyyy-mm-dd hh24:mi:ss'",
"alter session set NLS_TIMESTAMP_FORMAT='yyyy-mm-dd hh24:mi:ss'",
"alter session set NLS_TIMESTAMP_TZ_FORMAT='yyyy-mm-dd hh24:mi:ss'",
"alter session set TIME_ZONE='US/Pacific'"
]
```
`(注意&quot;是 " 的转义字符串)`
* 必选:否 <br />
* 默认值:无 <br />
### 3.3 类型转换
目前ClickhouseReader支持大部分Clickhouse类型但也存在部分个别类型没有支持的情况请注意检查你的类型。
下面列出ClickhouseReader针对Clickhouse类型转换列表:
| DataX 内部类型| Clickhouse 数据类型 |
| -------- |--------------------------------------------------------------------------------------------|
| Long | UInt8, UInt16, UInt32, UInt64, UInt128, UInt256, Int8, Int16, Int32, Int64, Int128, Int256 |
| Double | Float32, Float64, Decimal |
| String | String, FixedString |
| Date | DATE, Date32, DateTime, DateTime64 |
| Boolean | Boolean |
| Bytes | BLOB,BFILE,RAW,LONG RAW |
请注意:
* `除上述罗列字段类型外,其他类型均不支持`
## 4 性能报告
### 4.1 环境准备
#### 4.1.1 数据特征
为了模拟线上真实数据我们设计两个Clickhouse数据表分别为:
#### 4.1.2 机器参数
* 执行DataX的机器参数为:
* Clickhouse数据库机器参数为:
### 4.2 测试报告
#### 4.2.1 表1测试报告
| 并发任务数| DataX速度(Rec/s)|DataX流量|网卡流量|DataX运行负载|DB运行负载|
|--------| --------|--------|--------|--------|--------|
|1| DataX 统计速度(Rec/s)|DataX统计流量|网卡流量|DataX运行负载|DB运行负载|
## 5 约束限制
### 5.1 主备同步数据恢复问题
主备同步问题指Clickhouse使用主从灾备备库从主库不间断通过binlog恢复数据。由于主备数据同步存在一定的时间差特别在于某些特定情况例如网络延迟等问题导致备库同步恢复的数据与主库有较大差别导致从备库同步的数据不是一份当前时间的完整镜像。
针对这个问题我们提供了preSql功能该功能待补充。
### 5.2 一致性约束
Clickhouse在数据存储划分中属于RDBMS系统对外可以提供强一致性数据查询接口。例如当一次同步任务启动运行过程中当该库存在其他数据写入方写入数据时ClickhouseReader完全不会获取到写入更新数据这是由于数据库本身的快照特性决定的。关于数据库快照特性请参看[MVCC Wikipedia](https://en.wikipedia.org/wiki/Multiversion_concurrency_control)
上述是在ClickhouseReader单线程模型下数据同步一致性的特性由于ClickhouseReader可以根据用户配置信息使用了并发数据抽取因此不能严格保证数据一致性当ClickhouseReader根据splitPk进行数据切分后会先后启动多个并发任务完成数据同步。由于多个并发任务相互之间不属于同一个读事务同时多个并发任务存在时间间隔。因此这份数据并不是`完整的`、`一致的`数据快照信息。
针对多线程的一致性快照需求,在技术上目前无法实现,只能从工程角度解决,工程化的方式存在取舍,我们提供几个解决思路给用户,用户可以自行选择:
1. 使用单线程同步,即不再进行数据切片。缺点是速度比较慢,但是能够很好保证一致性。
2. 关闭其他数据写入方,保证当前数据为静态数据,例如,锁表、关闭备库同步等等。缺点是可能影响在线业务。
### 5.3 数据库编码问题
ClickhouseReader底层使用JDBC进行数据抽取JDBC天然适配各类编码并在底层进行了编码转换。因此ClickhouseReader不需用户指定编码可以自动获取编码并转码。
对于Clickhouse底层写入编码和其设定的编码不一致的混乱情况ClickhouseReader对此无法识别对此也无法提供解决方案对于这类情况`导出有可能为乱码`。
### 5.4 增量数据同步
ClickhouseReader使用JDBC SELECT语句完成数据抽取工作因此可以使用SELECT...WHERE...进行增量数据抽取,方式有多种:
* 数据库在线应用写入数据库时填充modify字段为更改时间戳包括新增、更新、删除(逻辑删)。对于这类应用ClickhouseReader只需要WHERE条件跟上一同步阶段时间戳即可。
* 对于新增流水型数据ClickhouseReader可以WHERE条件后跟上一阶段最大自增ID即可。
对于业务上无字段区分新增、修改数据情况ClickhouseReader也无法进行增量数据同步只能同步全量数据。
### 5.5 Sql安全性
ClickhouseReader提供querySql语句交给用户自己实现SELECT抽取语句ClickhouseReader本身对querySql不做任何安全性校验。这块交由DataX用户方自己保证。
## 6 FAQ
***
**Q: ClickhouseReader同步报错报错信息为XXX**
A: 网络或者权限问题请使用Clickhouse命令行测试
如果上述命令也报错那可以证实是环境问题请联系你的DBA。
**Q: ClickhouseReader抽取速度很慢怎么办**
A: 影响抽取时间的原因大概有如下几个:(来自专业 DBA 卫绾)
1. 由于SQL的plan异常导致的抽取时间长 在抽取时,尽可能使用全表扫描代替索引扫描;
2. 合理sql的并发度减少抽取时间
3. 抽取sql要简单尽量不用replace等函数这个非常消耗cpu会严重影响抽取速度;

91
clickhousereader/pom.xml Normal file
View File

@ -0,0 +1,91 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>datax-all</artifactId>
<groupId>com.alibaba.datax</groupId>
<version>0.0.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>clickhousereader</artifactId>
<name>clickhousereader</name>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>ru.yandex.clickhouse</groupId>
<artifactId>clickhouse-jdbc</artifactId>
<version>0.2.4</version>
</dependency>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-core</artifactId>
<version>${datax-project-version}</version>
</dependency>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-common</artifactId>
<version>${datax-project-version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
</dependency>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>plugin-rdbms-util</artifactId>
<version>${datax-project-version}</version>
</dependency>
</dependencies>
<build>
<resources>
<resource>
<directory>src/main/java</directory>
<includes>
<include>**/*.properties</include>
</includes>
</resource>
</resources>
<plugins>
<!-- compiler plugin -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${jdk-version}</source>
<target>${jdk-version}</target>
<encoding>${project-sourceEncoding}</encoding>
</configuration>
</plugin>
<!-- assembly plugin -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptors>
<descriptor>src/main/assembly/package.xml</descriptor>
</descriptors>
<finalName>datax</finalName>
</configuration>
<executions>
<execution>
<id>dwzip</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,35 @@
<assembly
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
<id></id>
<formats>
<format>dir</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<fileSets>
<fileSet>
<directory>src/main/resources</directory>
<includes>
<include>plugin.json</include>
<include>plugin_job_template.json</include>
</includes>
<outputDirectory>plugin/reader/clickhousereader</outputDirectory>
</fileSet>
<fileSet>
<directory>target/</directory>
<includes>
<include>clickhousereader-0.0.1-SNAPSHOT.jar</include>
</includes>
<outputDirectory>plugin/reader/clickhousereader</outputDirectory>
</fileSet>
</fileSets>
<dependencySets>
<dependencySet>
<useProjectArtifact>false</useProjectArtifact>
<outputDirectory>plugin/reader/clickhousereader/libs</outputDirectory>
<scope>runtime</scope>
</dependencySet>
</dependencySets>
</assembly>

View File

@ -0,0 +1,85 @@
package com.alibaba.datax.plugin.reader.clickhousereader;
import java.sql.Array;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Types;
import java.util.List;
import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.element.StringColumn;
import com.alibaba.datax.common.plugin.RecordSender;
import com.alibaba.datax.common.plugin.TaskPluginCollector;
import com.alibaba.datax.common.spi.Reader;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.common.util.MessageSource;
import com.alibaba.datax.plugin.rdbms.reader.CommonRdbmsReader;
import com.alibaba.datax.plugin.rdbms.util.DataBaseType;
import com.alibaba.fastjson2.JSON;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class ClickhouseReader extends Reader {
private static final DataBaseType DATABASE_TYPE = DataBaseType.ClickHouse;
private static final Logger LOG = LoggerFactory.getLogger(ClickhouseReader.class);
public static class Job extends Reader.Job {
private Configuration jobConfig = null;
private CommonRdbmsReader.Job commonRdbmsReaderMaster;
@Override
public void init() {
this.jobConfig = super.getPluginJobConf();
this.commonRdbmsReaderMaster = new CommonRdbmsReader.Job(DATABASE_TYPE);
this.commonRdbmsReaderMaster.init(this.jobConfig);
}
@Override
public List<Configuration> split(int mandatoryNumber) {
return this.commonRdbmsReaderMaster.split(this.jobConfig, mandatoryNumber);
}
@Override
public void post() {
this.commonRdbmsReaderMaster.post(this.jobConfig);
}
@Override
public void destroy() {
this.commonRdbmsReaderMaster.destroy(this.jobConfig);
}
}
public static class Task extends Reader.Task {
private Configuration jobConfig;
private CommonRdbmsReader.Task commonRdbmsReaderSlave;
@Override
public void init() {
this.jobConfig = super.getPluginJobConf();
this.commonRdbmsReaderSlave = new CommonRdbmsReader.Task(DATABASE_TYPE, super.getTaskGroupId(), super.getTaskId());
this.commonRdbmsReaderSlave.init(this.jobConfig);
}
@Override
public void startRead(RecordSender recordSender) {
int fetchSize = this.jobConfig.getInt(com.alibaba.datax.plugin.rdbms.reader.Constant.FETCH_SIZE, 1000);
this.commonRdbmsReaderSlave.startRead(this.jobConfig, recordSender, super.getTaskPluginCollector(), fetchSize);
}
@Override
public void post() {
this.commonRdbmsReaderSlave.post(this.jobConfig);
}
@Override
public void destroy() {
this.commonRdbmsReaderSlave.destroy(this.jobConfig);
}
}
}

View File

@ -0,0 +1,6 @@
{
"name": "clickhousereader",
"class": "com.alibaba.datax.plugin.reader.clickhousereader.ClickhouseReader",
"description": "useScene: prod. mechanism: Jdbc connection using the database, execute select sql.",
"developer": "alibaba"
}

View File

@ -0,0 +1,16 @@
{
"name": "clickhousereader",
"parameter": {
"username": "username",
"password": "password",
"column": ["col1", "col2", "col3"],
"connection": [
{
"jdbcUrl": "jdbc:clickhouse://<host>:<port>[/<database>]",
"table": ["table1", "table2"]
}
],
"preSql": [],
"postSql": []
}
}

View File

@ -0,0 +1,57 @@
{
"job": {
"setting": {
"speed": {
"channel": 5
}
},
"content": [
{
"reader": {
"name": "clickhousereader",
"parameter": {
"username": "XXXX",
"password": "XXXX",
"column": [
"uint8_col",
"uint16_col",
"uint32_col",
"uint64_col",
"int8_col",
"int16_col",
"int32_col",
"int64_col",
"float32_col",
"float64_col",
"bool_col",
"str_col",
"fixedstr_col",
"uuid_col",
"date_col",
"datetime_col",
"enum_col",
"ary_uint8_col",
"ary_str_col",
"tuple_col",
"nullable_col",
"nested_col.nested_id",
"nested_col.nested_str",
"ipv4_col",
"ipv6_col",
"decimal_col"
],
"connection": [
{
"table": [
"all_type_tbl"
],
"jdbcUrl":["jdbc:clickhouse://XXXX:8123/default"]
}
]
}
},
"writer": {}
}
]
}
}

View File

@ -0,0 +1,34 @@
CREATE TABLE IF NOT EXISTS default.all_type_tbl
(
`uint8_col` UInt8,
`uint16_col` UInt16,
uint32_col UInt32,
uint64_col UInt64,
int8_col Int8,
int16_col Int16,
int32_col Int32,
int64_col Int64,
float32_col Float32,
float64_col Float64,
bool_col UInt8,
str_col String,
fixedstr_col FixedString(3),
uuid_col UUID,
date_col Date,
datetime_col DateTime,
enum_col Enum('hello' = 1, 'world' = 2),
ary_uint8_col Array(UInt8),
ary_str_col Array(String),
tuple_col Tuple(UInt8, String),
nullable_col Nullable(UInt8),
nested_col Nested
(
nested_id UInt32,
nested_str String
),
ipv4_col IPv4,
ipv6_col IPv6,
decimal_col Decimal(5,3)
)
ENGINE = MergeTree()
ORDER BY (uint8_col);

View File

@ -5,6 +5,7 @@ import com.alibaba.datax.common.exception.DataXException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.sql.Time;
import java.util.Date;
/**
@ -12,18 +13,54 @@ import java.util.Date;
*/
public class DateColumn extends Column {
private DateType subType = DateType.DATETIME;
private DateType subType = DateType.DATETIME;
public static enum DateType {
DATE, TIME, DATETIME
}
private int nanos = 0;
/**
* 构建值为null的DateColumn使用Date子类型为DATETIME
* */
public DateColumn() {
this((Long)null);
}
private int precision = -1;
public static enum DateType {
DATE, TIME, DATETIME
}
/**
* 构建值为time(java.sql.Time)的DateColumn使用Date子类型为TIME只有时间没有日期
*/
public DateColumn(Time time, int nanos, int jdbcPrecision) {
this(time);
if (time != null) {
setNanos(nanos);
}
if (jdbcPrecision == 10) {
setPrecision(0);
}
if (jdbcPrecision >= 12 && jdbcPrecision <= 17) {
setPrecision(jdbcPrecision - 11);
}
}
public long getNanos() {
return nanos;
}
public void setNanos(int nanos) {
this.nanos = nanos;
}
public int getPrecision() {
return precision;
}
public void setPrecision(int precision) {
this.precision = precision;
}
/**
* 构建值为null的DateColumn使用Date子类型为DATETIME
*/
public DateColumn() {
this((Long) null);
}
/**
* 构建值为stamp(Unix时间戳)的DateColumn使用Date子类型为DATETIME

View File

@ -77,8 +77,8 @@ public class VMInfo {
garbageCollectorMXBeanList = java.lang.management.ManagementFactory.getGarbageCollectorMXBeans();
memoryPoolMXBeanList = java.lang.management.ManagementFactory.getMemoryPoolMXBeans();
osInfo = runtimeMXBean.getVmVendor() + " " + runtimeMXBean.getSpecVersion() + " " + runtimeMXBean.getVmVersion();
jvmInfo = osMXBean.getName() + " " + osMXBean.getArch() + " " + osMXBean.getVersion();
jvmInfo = runtimeMXBean.getVmVendor() + " " + runtimeMXBean.getSpecVersion() + " " + runtimeMXBean.getVmVersion();
osInfo = osMXBean.getName() + " " + osMXBean.getArch() + " " + osMXBean.getVersion();
totalProcessorCount = osMXBean.getAvailableProcessors();
//构建startPhyOSStatus

View File

@ -0,0 +1,34 @@
package com.alibaba.datax.common.util;
import org.apache.commons.lang3.StringUtils;
import java.util.HashMap;
import java.util.Map;
/**
* @author jitongchen
* @date 2023/9/7 9:47 AM
*/
public class LimitLogger {
private static Map<String, Long> lastPrintTime = new HashMap<>();
public static void limit(String name, long limit, LoggerFunction function) {
if (StringUtils.isBlank(name)) {
name = "__all__";
}
if (limit <= 0) {
function.apply();
} else {
if (!lastPrintTime.containsKey(name)) {
lastPrintTime.put(name, System.currentTimeMillis());
function.apply();
} else {
if (System.currentTimeMillis() > lastPrintTime.get(name) + limit) {
lastPrintTime.put(name, System.currentTimeMillis());
function.apply();
}
}
}
}
}

View File

@ -0,0 +1,10 @@
package com.alibaba.datax.common.util;
/**
* @author molin.lxd
* @date 2021-05-09
*/
public interface LoggerFunction {
void apply();
}

View File

@ -29,7 +29,7 @@ public class MemoryChannel extends Channel {
private ReentrantLock lock;
private Condition notInsufficient, notEmpty;
private Condition notSufficient, notEmpty;
public MemoryChannel(final Configuration configuration) {
super(configuration);
@ -37,7 +37,7 @@ public class MemoryChannel extends Channel {
this.bufferSize = configuration.getInt(CoreConstant.DATAX_CORE_TRANSPORT_EXCHANGER_BUFFERSIZE);
lock = new ReentrantLock();
notInsufficient = lock.newCondition();
notSufficient = lock.newCondition();
notEmpty = lock.newCondition();
}
@ -75,7 +75,7 @@ public class MemoryChannel extends Channel {
lock.lockInterruptibly();
int bytes = getRecordBytes(rs);
while (memoryBytes.get() + bytes > this.byteCapacity || rs.size() > this.queue.remainingCapacity()) {
notInsufficient.await(200L, TimeUnit.MILLISECONDS);
notSufficient.await(200L, TimeUnit.MILLISECONDS);
}
this.queue.addAll(rs);
waitWriterTime += System.nanoTime() - startTime;
@ -116,7 +116,7 @@ public class MemoryChannel extends Channel {
waitReaderTime += System.nanoTime() - startTime;
int bytes = getRecordBytes(rs);
memoryBytes.addAndGet(-bytes);
notInsufficient.signalAll();
notSufficient.signalAll();
} catch (InterruptedException e) {
throw DataXException.asDataXException(
FrameworkErrorCode.RUNTIME_ERROR, e);

View File

@ -168,6 +168,7 @@ public final class ConfigParser {
boolean isDefaultPath = StringUtils.isBlank(pluginPath);
if (isDefaultPath) {
configuration.set("path", path);
configuration.set("loadType","jarLoader");
}
Configuration result = Configuration.newDefault();

View File

@ -15,7 +15,7 @@ import java.util.List;
/**
* 提供Jar隔离的加载机制会把传入的路径及其子路径以及路径中的jar文件加入到class path
*/
public class JarLoader extends URLClassLoader {
public class JarLoader extends URLClassLoader{
public JarLoader(String[] paths) {
this(paths, JarLoader.class.getClassLoader());
}

View File

@ -49,7 +49,7 @@ public class LoadUtil {
/**
* jarLoader的缓冲
*/
private static Map<String, JarLoader> jarLoaderCenter = new HashMap<String, JarLoader>();
private static Map<String, JarLoader> jarLoaderCenter = new HashMap();
/**
* 设置pluginConfigs方便后面插件来获取

View File

@ -79,6 +79,8 @@ create table if not exsits datax.sample1(a string, b int64, c date, d timestamp,
"writer": {
"name": "databendwriter",
"parameter": {
"writeMode": "replace",
"onConflictColumn": ["id"],
"username": "databend",
"password": "databend",
"column": ["a", "b", "c", "d", "e", "f", "g"],
@ -149,6 +151,16 @@ create table if not exsits datax.sample1(a string, b int64, c date, d timestamp,
* 必选: 否
* 默认值: 无
* 示例: ["select count(*) from datax.sample1"]
* writeMode
* 描述:写入模式,支持 insert 和 replace 两种模式,默认为 insert。若为 replace务必填写 onConflictColumn 参数
* 必选:否
* 默认值insert
* 示例:"replace"
* onConflictColumn
* 描述on conflict 字段,指定 writeMode 为 replace 后,需要此参数
* 必选:否
* 默认值:无
* 示例:["id","user"]
### 3.3 类型转化
DataX中的数据类型可以转换为databend中的相应数据类型。下表显示了两种类型之间的对应关系。

View File

@ -142,6 +142,16 @@ create table if not exsits datax.sample1(a string, b int64, c date, d timestamp,
* Description: A list of SQL statements that will be executed after the write operation.
* Required: no
* Default: none
* writeMode
* DescriptionThe write mode, support `insert` and `replace` two mode.
* Requiredno
* Defaultinsert
* Example"replace"
* onConflictColumn
* DescriptionOn conflict fields list.
* Requiredno
* Defaultnone
* Example["id","user"]
### 3.3 Type Convert
Data types in datax can be converted to the corresponding data types in databend. The following table shows the correspondence between the two types.

View File

@ -17,7 +17,7 @@
<dependency>
<groupId>com.databend</groupId>
<artifactId>databend-jdbc</artifactId>
<version>0.0.7</version>
<version>0.1.0</version>
</dependency>
<dependency>
<groupId>com.alibaba.datax</groupId>

View File

@ -17,20 +17,17 @@ import java.sql.*;
import java.util.List;
import java.util.regex.Pattern;
public class DatabendWriter extends Writer
{
public class DatabendWriter extends Writer {
private static final DataBaseType DATABASE_TYPE = DataBaseType.Databend;
public static class Job
extends Writer.Job
{
extends Writer.Job {
private static final Logger LOG = LoggerFactory.getLogger(Job.class);
private Configuration originalConfig;
private CommonRdbmsWriter.Job commonRdbmsWriterMaster;
@Override
public void init()
{
public void init() throws DataXException {
this.originalConfig = super.getPluginJobConf();
this.commonRdbmsWriterMaster = new CommonRdbmsWriter.Job(DATABASE_TYPE);
this.commonRdbmsWriterMaster.init(this.originalConfig);
@ -39,8 +36,7 @@ public class DatabendWriter extends Writer
}
@Override
public void preCheck()
{
public void preCheck() {
this.init();
this.commonRdbmsWriterMaster.writerPreCheck(this.originalConfig, DATABASE_TYPE);
}
@ -67,8 +63,7 @@ public class DatabendWriter extends Writer
}
public static class Task extends Writer.Task
{
public static class Task extends Writer.Task {
private static final Logger LOG = LoggerFactory.getLogger(Task.class);
private Configuration writerSliceConfig;
@ -76,11 +71,10 @@ public class DatabendWriter extends Writer
private CommonRdbmsWriter.Task commonRdbmsWriterSlave;
@Override
public void init()
{
public void init() {
this.writerSliceConfig = super.getPluginJobConf();
this.commonRdbmsWriterSlave = new CommonRdbmsWriter.Task(DataBaseType.Databend){
this.commonRdbmsWriterSlave = new CommonRdbmsWriter.Task(DataBaseType.Databend) {
@Override
protected PreparedStatement fillPreparedStatementColumnType(PreparedStatement preparedStatement, int columnIndex, int columnSqltype, String typeName, Column column) throws SQLException {
try {
@ -177,8 +171,8 @@ public class DatabendWriter extends Writer
case Types.BOOLEAN:
// warn: bit(1) -> Types.BIT 可使用setBoolean
// warn: bit(>1) -> Types.VARBINARY 可使用setBytes
// warn: bit(1) -> Types.BIT 可使用setBoolean
// warn: bit(>1) -> Types.VARBINARY 可使用setBytes
case Types.BIT:
if (this.dataBaseType == DataBaseType.MySql) {
Boolean asBoolean = column.asBoolean();
@ -224,8 +218,7 @@ public class DatabendWriter extends Writer
}
@Override
public void destroy()
{
public void destroy() {
this.commonRdbmsWriterSlave.destroy(this.writerSliceConfig);
}
@ -238,9 +231,9 @@ public class DatabendWriter extends Writer
public void post() {
this.commonRdbmsWriterSlave.post(this.writerSliceConfig);
}
@Override
public void startWrite(RecordReceiver lineReceiver)
{
public void startWrite(RecordReceiver lineReceiver) {
this.commonRdbmsWriterSlave.startWrite(lineReceiver, this.writerSliceConfig, this.getTaskPluginCollector());
}

View File

@ -0,0 +1,33 @@
package com.alibaba.datax.plugin.writer.databendwriter;
import com.alibaba.datax.common.spi.ErrorCode;
public enum DatabendWriterErrorCode implements ErrorCode {
CONF_ERROR("DatabendWriter-00", "配置错误."),
WRITE_DATA_ERROR("DatabendWriter-01", "写入数据时失败."),
;
private final String code;
private final String description;
private DatabendWriterErrorCode(String code, String description) {
this.code = code;
this.description = description;
}
@Override
public String getCode() {
return this.code;
}
@Override
public String getDescription() {
return this.description;
}
@Override
public String toString() {
return String.format("Code:[%s], Description:[%s].", this.code, this.description);
}
}

View File

@ -1,40 +1,72 @@
package com.alibaba.datax.plugin.writer.databendwriter.util;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.plugin.rdbms.writer.Constant;
import com.alibaba.datax.plugin.rdbms.writer.Key;
import com.alibaba.datax.plugin.writer.databendwriter.DatabendWriterErrorCode;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.crypto.Data;
import java.util.List;
import java.util.StringJoiner;
public final class DatabendWriterUtil
{
public final class DatabendWriterUtil {
private static final Logger LOG = LoggerFactory.getLogger(DatabendWriterUtil.class);
private DatabendWriterUtil() {}
public static void dealWriteMode(Configuration originalConfig)
{
private DatabendWriterUtil() {
}
public static void dealWriteMode(Configuration originalConfig) throws DataXException {
List<String> columns = originalConfig.getList(Key.COLUMN, String.class);
List<String> onConflictColumns = originalConfig.getList(Key.ONCONFLICT_COLUMN, String.class);
StringBuilder writeDataSqlTemplate = new StringBuilder();
String jdbcUrl = originalConfig.getString(String.format("%s[0].%s",
Constant.CONN_MARK, Key.JDBC_URL, String.class));
String writeMode = originalConfig.getString(Key.WRITE_MODE, "INSERT");
LOG.info("write mode is {}", writeMode);
if (writeMode.toLowerCase().contains("replace")) {
if (onConflictColumns == null || onConflictColumns.size() == 0) {
throw DataXException
.asDataXException(
DatabendWriterErrorCode.CONF_ERROR,
String.format(
"Replace mode must has onConflictColumn config."
));
}
StringBuilder writeDataSqlTemplate = new StringBuilder();
writeDataSqlTemplate.append("INSERT INTO %s");
StringJoiner columnString = new StringJoiner(",");
// for databend if you want to use replace mode, the writeMode should be: "writeMode": "replace"
writeDataSqlTemplate.append("REPLACE INTO %s (")
.append(StringUtils.join(columns, ",")).append(") ").append(onConFlictDoString(onConflictColumns))
.append(" VALUES");
for (String column : columns) {
columnString.add(column);
LOG.info("Replace data [\n{}\n], which jdbcUrl like:[{}]", writeDataSqlTemplate, jdbcUrl);
originalConfig.set(Constant.INSERT_OR_REPLACE_TEMPLATE_MARK, writeDataSqlTemplate);
} else {
writeDataSqlTemplate.append("INSERT INTO %s");
StringJoiner columnString = new StringJoiner(",");
for (String column : columns) {
columnString.add(column);
}
writeDataSqlTemplate.append(String.format("(%s)", columnString));
writeDataSqlTemplate.append(" VALUES");
LOG.info("Insert data [\n{}\n], which jdbcUrl like:[{}]", writeDataSqlTemplate, jdbcUrl);
originalConfig.set(Constant.INSERT_OR_REPLACE_TEMPLATE_MARK, writeDataSqlTemplate);
}
writeDataSqlTemplate.append(String.format("(%s)", columnString));
writeDataSqlTemplate.append(" VALUES");
LOG.info("Write data [\n{}\n], which jdbcUrl like:[{}]", writeDataSqlTemplate, jdbcUrl);
originalConfig.set(Constant.INSERT_OR_REPLACE_TEMPLATE_MARK, writeDataSqlTemplate);
}
}
public static String onConFlictDoString(List<String> conflictColumns) {
return " ON " +
"(" +
StringUtils.join(conflictColumns, ",") + ") ";
}
}

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-example</artifactId>
<version>0.0.1-SNAPSHOT</version>
</parent>
<artifactId>datax-example-core</artifactId>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
</project>

View File

@ -0,0 +1,26 @@
package com.alibaba.datax.example;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.core.Engine;
import com.alibaba.datax.example.util.ExampleConfigParser;
/**
* {@code Date} 2023/8/6 11:22
*
* @author fuyouj
*/
public class ExampleContainer {
/**
* example对外暴露的启动入口
* 使用前最好看下 datax-example/doc/README.MD
* @param jobPath 任务json绝对路径
*/
public static void start(String jobPath) {
Configuration configuration = ExampleConfigParser.parse(jobPath);
Engine engine = new Engine();
engine.start(configuration);
}
}

View File

@ -0,0 +1,23 @@
package com.alibaba.datax.example;
import com.alibaba.datax.example.util.PathUtil;
/**
* @author fuyouj
*/
public class Main {
/**
* 1.在example模块pom文件添加你依赖的的调试插件
* 你可以直接打开本模块的pom文件,参考是如何引入streamreaderstreamwriter
* 2. 在此处指定你的job文件
*/
public static void main(String[] args) {
String classPathJobPath = "/job/stream2stream.json";
String absJobPath = PathUtil.getAbsolutePathFromClassPath(classPathJobPath);
ExampleContainer.start(absJobPath);
}
}

View File

@ -0,0 +1,154 @@
package com.alibaba.datax.example.util;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.core.util.ConfigParser;
import com.alibaba.datax.core.util.FrameworkErrorCode;
import com.alibaba.datax.core.util.container.CoreConstant;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.file.Paths;
import java.util.*;
/**
* @author fuyouj
*/
public class ExampleConfigParser {
private static final String CORE_CONF = "/example/conf/core.json";
private static final String PLUGIN_DESC_FILE = "plugin.json";
/**
* 指定Job配置路径ConfigParser会解析JobPluginCore全部信息并以Configuration返回
* 不同于Core的ConfigParser,这里的core,plugin 不依赖于编译后的datax.home,而是扫描程序编译后的target目录
*/
public static Configuration parse(final String jobPath) {
Configuration configuration = ConfigParser.parseJobConfig(jobPath);
configuration.merge(coreConfig(),
false);
Map<String, String> pluginTypeMap = new HashMap<>();
String readerName = configuration.getString(CoreConstant.DATAX_JOB_CONTENT_READER_NAME);
String writerName = configuration.getString(CoreConstant.DATAX_JOB_CONTENT_WRITER_NAME);
pluginTypeMap.put(readerName, "reader");
pluginTypeMap.put(writerName, "writer");
Configuration pluginsDescConfig = parsePluginsConfig(pluginTypeMap);
configuration.merge(pluginsDescConfig, false);
return configuration;
}
private static Configuration parsePluginsConfig(Map<String, String> pluginTypeMap) {
Configuration configuration = Configuration.newDefault();
//最初打算通过user.dir获取工作目录来扫描插件
//但是user.dir在不同有一些不确定性所以废弃了这个选择
for (File basePackage : runtimeBasePackages()) {
if (pluginTypeMap.isEmpty()) {
break;
}
scanPluginByPackage(basePackage, configuration, basePackage.listFiles(), pluginTypeMap);
}
if (!pluginTypeMap.isEmpty()) {
String failedPlugin = pluginTypeMap.keySet().toString();
String message = "\nplugin %s load failed ry to analyze the reasons from the following aspects.。\n" +
"1: Check if the name of the plugin is spelled correctly, and verify whether DataX supports this plugin\n" +
"2Verify if the <resource></resource> tag has been added under <build></build> section in the pom file of the relevant plugin.\n<resource>" +
" <directory>src/main/resources</directory>\n" +
" <includes>\n" +
" <include>**/*.*</include>\n" +
" </includes>\n" +
" <filtering>true</filtering>\n" +
" </resource>\n [Refer to the streamreader pom file] \n" +
"3: Check that the datax-yourPlugin-example module imported your test plugin";
message = String.format(message, failedPlugin);
throw DataXException.asDataXException(FrameworkErrorCode.PLUGIN_INIT_ERROR, message);
}
return configuration;
}
/**
* 通过classLoader获取程序编译的输出目录
*
* @return File[/datax-example/target/classes,xxReader/target/classes,xxWriter/target/classes]
*/
private static File[] runtimeBasePackages() {
List<File> basePackages = new ArrayList<>();
ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
Enumeration<URL> resources = null;
try {
resources = classLoader.getResources("");
} catch (IOException e) {
throw DataXException.asDataXException(e.getMessage());
}
while (resources.hasMoreElements()) {
URL resource = resources.nextElement();
File file = new File(resource.getFile());
if (file.isDirectory()) {
basePackages.add(file);
}
}
return basePackages.toArray(new File[0]);
}
/**
* @param packageFile 编译出来的target/classes根目录 便于找到插件时设置插件的URL目录设置根目录是最保险的方式
* @param configuration pluginConfig
* @param files 待扫描文件
* @param needPluginTypeMap 需要的插件
*/
private static void scanPluginByPackage(File packageFile,
Configuration configuration,
File[] files,
Map<String, String> needPluginTypeMap) {
if (files == null) {
return;
}
for (File file : files) {
if (file.isFile() && PLUGIN_DESC_FILE.equals(file.getName())) {
Configuration pluginDesc = Configuration.from(file);
String descPluginName = pluginDesc.getString("name", "");
if (needPluginTypeMap.containsKey(descPluginName)) {
String type = needPluginTypeMap.get(descPluginName);
configuration.merge(parseOnePlugin(packageFile.getAbsolutePath(), type, descPluginName, pluginDesc), false);
needPluginTypeMap.remove(descPluginName);
}
} else {
scanPluginByPackage(packageFile, configuration, file.listFiles(), needPluginTypeMap);
}
}
}
private static Configuration parseOnePlugin(String packagePath,
String pluginType,
String pluginName,
Configuration pluginDesc) {
//设置path 兼容jarLoader的加载方式URLClassLoader
pluginDesc.set("path", packagePath);
Configuration pluginConfInJob = Configuration.newDefault();
pluginConfInJob.set(
String.format("plugin.%s.%s", pluginType, pluginName),
pluginDesc.getInternal());
return pluginConfInJob;
}
private static Configuration coreConfig() {
try {
URL resource = ExampleConfigParser.class.getResource(CORE_CONF);
return Configuration.from(Paths.get(resource.toURI()).toFile());
} catch (Exception ignore) {
throw DataXException.asDataXException("Failed to load the configuration file core.json. " +
"Please check whether /example/conf/core.json exists!");
}
}
}

View File

@ -0,0 +1,26 @@
package com.alibaba.datax.example.util;
import com.alibaba.datax.common.exception.DataXException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.file.Paths;
/**
* @author fuyouj
*/
public class PathUtil {
public static String getAbsolutePathFromClassPath(String path) {
URL resource = PathUtil.class.getResource(path);
try {
assert resource != null;
URI uri = resource.toURI();
return Paths.get(uri).toString();
} catch (NullPointerException | URISyntaxException e) {
throw DataXException.asDataXException("path error,please check whether the path is correct");
}
}
}

View File

@ -0,0 +1,60 @@
{
"entry": {
"jvm": "-Xms1G -Xmx1G",
"environment": {}
},
"common": {
"column": {
"datetimeFormat": "yyyy-MM-dd HH:mm:ss",
"timeFormat": "HH:mm:ss",
"dateFormat": "yyyy-MM-dd",
"extraFormats":["yyyyMMdd"],
"timeZone": "GMT+8",
"encoding": "utf-8"
}
},
"core": {
"dataXServer": {
"address": "http://localhost:7001/api",
"timeout": 10000,
"reportDataxLog": false,
"reportPerfLog": false
},
"transport": {
"channel": {
"class": "com.alibaba.datax.core.transport.channel.memory.MemoryChannel",
"speed": {
"byte": -1,
"record": -1
},
"flowControlInterval": 20,
"capacity": 512,
"byteCapacity": 67108864
},
"exchanger": {
"class": "com.alibaba.datax.core.plugin.BufferedRecordExchanger",
"bufferSize": 32
}
},
"container": {
"job": {
"reportInterval": 10000
},
"taskGroup": {
"channel": 5
},
"trace": {
"enable": "false"
}
},
"statistics": {
"collector": {
"plugin": {
"taskClass": "com.alibaba.datax.core.statistics.plugin.task.StdoutPluginCollector",
"maxDirtyNumber": 10
}
}
}
}
}

View File

@ -0,0 +1,19 @@
package com.alibaba.datax.example.util;
import org.junit.Assert;
import org.junit.Test;
/**
* {@code Author} FuYouJ
* {@code Date} 2023/8/19 21:38
*/
public class PathUtilTest {
@Test
public void testParseClassPathFile() {
String path = "/pathTest.json";
String absolutePathFromClassPath = PathUtil.getAbsolutePathFromClassPath(path);
Assert.assertNotNull(absolutePathFromClassPath);
}
}

View File

@ -0,0 +1,43 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-example</artifactId>
<version>0.0.1-SNAPSHOT</version>
</parent>
<artifactId>datax-example-neo4j</artifactId>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<test.container.version>1.17.6</test.container.version>
<neo4j-java-driver.version>4.4.9</neo4j-java-driver.version>
</properties>
<dependencies>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-example-core</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.testcontainers</groupId>
<artifactId>testcontainers</artifactId>
<version>${test.container.version}</version>
</dependency>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>neo4jwriter</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-example-streamreader</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,138 @@
package com.alibaba.datax.example.neo4j;
import com.alibaba.datax.example.ExampleContainer;
import com.alibaba.datax.example.util.PathUtil;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.neo4j.driver.*;
import org.neo4j.driver.types.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.containers.Network;
import org.testcontainers.containers.output.Slf4jLogConsumer;
import org.testcontainers.lifecycle.Startables;
import org.testcontainers.shaded.org.awaitility.Awaitility;
import org.testcontainers.utility.DockerImageName;
import org.testcontainers.utility.DockerLoggerFactory;
import java.net.URI;
import java.util.Arrays;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
/**
* {@code Author} FuYouJ
* {@code Date} 2023/8/19 21:48
*/
public class StreamReader2Neo4jWriterTest {
private static final Logger LOGGER = LoggerFactory.getLogger(StreamReader2Neo4jWriterTest.class);
private static final String CONTAINER_IMAGE = "neo4j:5.9.0";
private static final String CONTAINER_HOST = "neo4j-host";
private static final int HTTP_PORT = 7474;
private static final int BOLT_PORT = 7687;
private static final String CONTAINER_NEO4J_USERNAME = "neo4j";
private static final String CONTAINER_NEO4J_PASSWORD = "Test@12343";
private static final URI CONTAINER_URI = URI.create("neo4j://localhost:" + BOLT_PORT);
protected static final Network NETWORK = Network.newNetwork();
private GenericContainer<?> container;
protected Driver neo4jDriver;
protected Session neo4jSession;
private static final int CHANNEL = 5;
private static final int READER_NUM = 10;
@Before
public void init() {
DockerImageName imageName = DockerImageName.parse(CONTAINER_IMAGE);
container =
new GenericContainer<>(imageName)
.withNetwork(NETWORK)
.withNetworkAliases(CONTAINER_HOST)
.withExposedPorts(HTTP_PORT, BOLT_PORT)
.withEnv(
"NEO4J_AUTH",
CONTAINER_NEO4J_USERNAME + "/" + CONTAINER_NEO4J_PASSWORD)
.withEnv("apoc.export.file.enabled", "true")
.withEnv("apoc.import.file.enabled", "true")
.withEnv("apoc.import.file.use_neo4j_config", "true")
.withEnv("NEO4J_PLUGINS", "[\"apoc\"]")
.withLogConsumer(
new Slf4jLogConsumer(
DockerLoggerFactory.getLogger(CONTAINER_IMAGE)));
container.setPortBindings(
Arrays.asList(
String.format("%s:%s", HTTP_PORT, HTTP_PORT),
String.format("%s:%s", BOLT_PORT, BOLT_PORT)));
Startables.deepStart(Stream.of(container)).join();
LOGGER.info("container started");
Awaitility.given()
.ignoreExceptions()
.await()
.atMost(30, TimeUnit.SECONDS)
.untilAsserted(this::initConnection);
}
//在neo4jWriter模块使用Example测试整个job,方便发现整个流程的代码问题
@Test
public void streamReader2Neo4j() {
deleteHistoryIfExist();
String path = "/streamreader2neo4j.json";
String jobPath = PathUtil.getAbsolutePathFromClassPath(path);
ExampleContainer.start(jobPath);
//根据channel和reader的mock数据校验结果集是否符合预期
verifyWriteResult();
}
private void deleteHistoryIfExist() {
String query = "match (n:StreamReader) return n limit 1";
String delete = "match (n:StreamReader) delete n";
if (neo4jSession.run(query).hasNext()) {
neo4jSession.run(delete);
}
}
private void verifyWriteResult() {
int total = CHANNEL * READER_NUM;
String query = "match (n:StreamReader) return n";
Result run = neo4jSession.run(query);
int count = 0;
while (run.hasNext()) {
Record record = run.next();
Node node = record.get("n").asNode();
if (node.hasLabel("StreamReader")) {
count++;
}
}
Assert.assertEquals(count, total);
}
@After
public void destroy() {
if (neo4jSession != null) {
neo4jSession.close();
}
if (neo4jDriver != null) {
neo4jDriver.close();
}
if (container != null) {
container.close();
}
}
private void initConnection() {
neo4jDriver =
GraphDatabase.driver(
CONTAINER_URI,
AuthTokens.basic(CONTAINER_NEO4J_USERNAME, CONTAINER_NEO4J_PASSWORD));
neo4jSession = neo4jDriver.session(SessionConfig.forDatabase("neo4j"));
}
}

View File

@ -0,0 +1,51 @@
{
"job": {
"content": [
{
"reader": {
"name": "streamreader",
"parameter": {
"sliceRecordCount": 10,
"column": [
{
"type": "string",
"value": "StreamReader"
},
{
"type": "string",
"value": "1997"
}
]
}
},
"writer": {
"name": "neo4jWriter",
"parameter": {
"uri": "bolt://localhost:7687",
"username":"neo4j",
"password":"Test@12343",
"database":"neo4j",
"cypher": "unwind $batch as row CALL apoc.cypher.doIt( 'create (n:`' + row.Label + '`{id:$id})' ,{id: row.id} ) YIELD value RETURN 1 ",
"batchDataVariableName": "batch",
"batchSize": "3",
"properties": [
{
"name": "Label",
"type": "string"
},
{
"name": "id",
"type": "STRING"
}
]
}
}
}
],
"setting": {
"speed": {
"channel": 5
}
}
}
}

View File

@ -0,0 +1,37 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-example</artifactId>
<version>0.0.1-SNAPSHOT</version>
</parent>
<artifactId>datax-example-streamreader</artifactId>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-example-core</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>streamreader</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>streamwriter</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,19 @@
package com.alibaba.datax.example.streamreader;
import com.alibaba.datax.example.ExampleContainer;
import com.alibaba.datax.example.util.PathUtil;
import org.junit.Test;
/**
* {@code Author} FuYouJ
* {@code Date} 2023/8/14 20:16
*/
public class StreamReader2StreamWriterTest {
@Test
public void testStreamReader2StreamWriter() {
String path = "/stream2stream.json";
String jobPath = PathUtil.getAbsolutePathFromClassPath(path);
ExampleContainer.start(jobPath);
}
}

View File

@ -0,0 +1,36 @@
{
"job": {
"content": [
{
"reader": {
"name": "streamreader",
"parameter": {
"sliceRecordCount": 10,
"column": [
{
"type": "long",
"value": "10"
},
{
"type": "string",
"value": "hello你好世界-DataX"
}
]
}
},
"writer": {
"name": "streamwriter",
"parameter": {
"encoding": "UTF-8",
"print": true
}
}
}
],
"setting": {
"speed": {
"channel": 5
}
}
}
}

107
datax-example/doc/README.md Normal file
View File

@ -0,0 +1,107 @@
## [DataX-Example]调试datax插件的模块
### 为什么要开发这个模块
一般使用DataX启动数据同步任务是从datax.py 脚本开始获取程序datax包目录设置到系统变量datax.home里此后系统核心插件的加载配置初始化均依赖于变量datax.home,这带来了一些麻烦,以一次本地 DeBug streamreader 插件为例。
- maven 打包 datax 生成 datax 目录
- 在 IDE 中 设置系统环境变量 datax.home或者在Engine启动类中硬编码设置datax.home。
- 修改插件 streamreader 代码
- 再次 maven 打包使JarLoader 能够加载到最新的 streamreader 代码。
- 调试代码
在以上步骤中,打包完全不必要且最耗时,等待打包也最煎熬。
所以我编写一个新的模块(datax-example),此模块特用于本地调试和复现 BUG。如果模块顺利编写完成那么以上流程将被简化至两步。
- 修改插件 streamreader 代码。
- 调试代码
<img src="img/img01.png" alt="img" style="zoom:40%;" />
### 目录结构
该目录结构演示了如何使用datax-example-core编写测试用例和校验代码流程。
<img src="img/img03.png" alt="img" style="zoom:100%;" />
### 实现原理
- 不修改原有的ConfigParer,使用新的ExampleConfigParser,仅用于example模块。他不依赖datax.home,而是依赖ide编译后的target目录
- 将ide的target目录作为每个插件的目录类加载目录。
![img](img/img02.png)
### 如何使用
1.修改插件的pom文件做如下改动。以streamreader为例。<br/>
改动前
```xml
<build>
<plugins>
<!-- compiler plugin -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${jdk-version}</source>
<target>${jdk-version}</target>
<encoding>${project-sourceEncoding}</encoding>
</configuration>
</plugin>
</plugins>
</build>
```
改动后
```xml
<build>
<resources>
<!--将resource目录也输出到target-->
<resource>
<directory>src/main/resources</directory>
<includes>
<include>**/*.*</include>
</includes>
<filtering>true</filtering>
</resource>
</resources>
<plugins>
<!-- compiler plugin -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${jdk-version}</source>
<target>${jdk-version}</target>
<encoding>${project-sourceEncoding}</encoding>
</configuration>
</plugin>
</plugins>
</build>
```
#### 在测试模块模块使用
参考datax-example/datax-example-streamreader的StreamReader2StreamWriterTest.java
```java
public class StreamReader2StreamWriterTest {
@Test
public void testStreamReader2StreamWriter() {
String path = "/stream2stream.json";
String jobPath = PathUtil.getAbsolutePathFromClassPath(path);
ExampleContainer.start(jobPath);
}
}
```
参考datax-example/datax-example-neo4j的StreamReader2Neo4jWriterTest
```java
public class StreamReader2Neo4jWriterTest{
@Test
public void streamReader2Neo4j() {
deleteHistoryIfExist();
String path = "/streamreader2neo4j.json";
String jobPath = PathUtil.getAbsolutePathFromClassPath(path);
ExampleContainer.start(jobPath);
//根据channel和reader的mock数据校验结果集是否符合预期
verifyWriteResult();
}
}
```

Binary file not shown.

After

Width:  |  Height:  |  Size: 71 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 66 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 43 KiB

68
datax-example/pom.xml Normal file
View File

@ -0,0 +1,68 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-all</artifactId>
<version>0.0.1-SNAPSHOT</version>
</parent>
<artifactId>datax-example</artifactId>
<packaging>pom</packaging>
<modules>
<module>datax-example-core</module>
<module>datax-example-streamreader</module>
<module>datax-example-neo4j</module>
</modules>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<junit4.version>4.13.2</junit4.version>
</properties>
<dependencies>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-common</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-core</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit4.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<resources>
<resource>
<directory>src/main/resources</directory>
<includes>
<include>**/*.*</include>
</includes>
<filtering>true</filtering>
</resource>
</resources>
<plugins>
<!-- compiler plugin -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${jdk-version}</source>
<target>${jdk-version}</target>
<encoding>${project-sourceEncoding}</encoding>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -447,6 +447,9 @@ DataX的内部类型在实现上会选用不同的java类型
3. 用户在插件中在`reader`/`writer`配置的`name`字段指定插件名字。框架根据插件的类型(`reader`/`writer`和插件名称去插件的路径下扫描所有的jar加入`classpath`。
4. 根据插件配置中定义的入口类,框架通过反射实例化对应的`Job`和`Task`对象。
### 编写测试用例
1. 在datax-example工程下新建新的插件测试模块,调用`ExampleContainer.start(jobPath)`方法来检测你的代码逻辑是否正确。[datax-example使用](https://github.com/alibaba/DataX/blob/master/datax-example/doc/README.md)
## 三、Last but not Least

View File

@ -45,7 +45,7 @@
<dependency>
<groupId>com.jcraft</groupId>
<artifactId>jsch</artifactId>
<version>0.1.51</version>
<version>0.1.54</version>
</dependency>
<dependency>
<groupId>commons-net</groupId>
@ -89,4 +89,4 @@
</plugins>
</build>
</project>
</project>

View File

@ -64,6 +64,8 @@ public class SftpHelper extends FtpHelper {
String message = String.format("请确认连接ftp服务器端口是否正确错误的端口: [%s] ", port);
LOG.error(message);
throw DataXException.asDataXException(FtpReaderErrorCode.FAIL_LOGIN, message, e);
}else{
throw DataXException.asDataXException(FtpReaderErrorCode.COMMAND_FTP_IO_EXCEPTION, "", e);
}
}else {
if("Auth fail".equals(e.getMessage())){

View File

@ -45,7 +45,7 @@
<dependency>
<groupId>com.jcraft</groupId>
<artifactId>jsch</artifactId>
<version>0.1.51</version>
<version>0.1.54</version>
</dependency>
<dependency>
<groupId>commons-net</groupId>

View File

@ -0,0 +1,297 @@
# GaussDbReader 插件文档
___
## 1 快速介绍
GaussDbReader插件实现了从GaussDB读取数据。在底层实现上GaussDbReader通过JDBC连接远程GaussDB数据库并执行相应的sql语句将数据从GaussDB库中SELECT出来。
## 2 实现原理
简而言之GaussDbReader通过JDBC连接器连接到远程的GaussDB数据库并根据用户配置的信息生成查询SELECT SQL语句并发送到远程GaussDB数据库并将该SQL执行返回结果使用DataX自定义的数据类型拼装为抽象的数据集并传递给下游Writer处理。
对于用户配置Table、Column、Where的信息GaussDbReader将其拼接为SQL语句发送到GaussDB数据库对于用户配置querySql信息GaussDbReader直接将其发送到GaussDB数据库。
## 3 功能说明
### 3.1 配置样例
* 配置一个从GaussDB数据库同步抽取数据到本地的作业:
```
{
"job": {
"setting": {
"speed": {
//设置传输速度单位为byte/sDataX运行会尽可能达到该速度但是不超过它.
"byte": 1048576
},
//出错限制
"errorLimit": {
//出错的record条数上限当大于该值即报错。
"record": 0,
//出错的record百分比上限 1.0表示100%0.02表示2%
"percentage": 0.02
}
},
"content": [
{
"reader": {
"name": "gaussdbreader",
"parameter": {
// 数据库连接用户名
"username": "xx",
// 数据库连接密码
"password": "xx",
"column": [
"id","name"
],
//切分主键
"splitPk": "id",
"connection": [
{
"table": [
"table"
],
"jdbcUrl": [
"jdbc:opengauss://host:port/database"
]
}
]
}
},
"writer": {
//writer类型
"name": "streamwriter",
//是否打印内容
"parameter": {
"print":true,
}
}
}
]
}
}
```
* 配置一个自定义SQL的数据库同步任务到本地内容的作业
```json
{
"job": {
"setting": {
"speed": 1048576
},
"content": [
{
"reader": {
"name": "gaussdbreader",
"parameter": {
"username": "xx",
"password": "xx",
"where": "",
"connection": [
{
"querySql": [
"select db_id,on_line_flag from db_info where db_id < 10;"
],
"jdbcUrl": [
"jdbc:opengauss://host:port/database", "jdbc:opengauss://host:port/database"
]
}
]
}
},
"writer": {
"name": "streamwriter",
"parameter": {
"print": false,
"encoding": "UTF-8"
}
}
}
]
}
}
```
### 3.2 参数说明
* **jdbcUrl**
* 描述描述的是到对端数据库的JDBC连接信息使用JSON的数组描述并支持一个库填写多个连接地址。之所以使用JSON数组描述连接信息是因为阿里集团内部支持多个IP探测如果配置了多个GaussDbReader可以依次探测ip的可连接性直到选择一个合法的IP。如果全部连接失败GaussDbReader报错。 注意jdbcUrl必须包含在connection配置单元中。对于阿里集团外部使用情况JSON数组填写一个JDBC连接即可。
jdbcUrl按照GaussDB官方规范并可以填写连接附件控制信息。具体请参看[GaussDB官方文档](https://docs.opengauss.org/zh/docs/3.1.0/docs/Developerguide/java-sql-Connection.html)。
* 必选:是 <br />
* 默认值:无 <br />
* **username**
* 描述:数据源的用户名 <br />
* 必选:是 <br />
* 默认值:无 <br />
* **password**
* 描述:数据源指定用户名的密码 <br />
* 必选:是 <br />
* 默认值:无 <br />
* **table**
* 描述所选取的需要同步的表。使用JSON的数组描述因此支持多张表同时抽取。当配置为多张表时用户自己需保证多张表是同一schema结构GaussDbReader不予检查表是否同一逻辑表。注意table必须包含在connection配置单元中。<br />
* 必选:是 <br />
* 默认值:无 <br />
* **column**
* 描述所配置的表中需要同步的列名集合使用JSON的数组描述字段信息。用户使用\*代表默认使用所有列配置,例如['\*']。
支持列裁剪,即列可以挑选部分列进行导出。
支持列换序即列可以不按照表schema信息进行导出。
支持常量配置用户需要按照GaussDB语法格式:
["id", "'hello'::varchar", "true", "2.5::real", "power(2,3)"]
id为普通列名'hello'::varchar为字符串常量true为布尔值2.5为浮点数, power(2,3)为函数。
**column必须用户显示指定同步的列集合不允许为空**
* 必选:是 <br />
* 默认值:无 <br />
* **splitPk**
* 描述GaussDbReader进行数据抽取时如果指定splitPk表示用户希望使用splitPk代表的字段进行数据分片DataX因此会启动并发任务进行数据同步这样可以大大提高数据同步的效能。
推荐splitPk用户使用表主键因为表主键通常情况下比较均匀因此切分出来的分片也不容易出现数据热点。
目前splitPk仅支持整形数据切分`不支持浮点、字符串型、日期等其他类型`。如果用户指定其他非支持类型GaussDbReader将报错
splitPk设置为空底层将视作用户不允许对单表进行切分因此使用单通道进行抽取。
* 必选:否 <br />
* 默认值:空 <br />
* **where**
* 描述筛选条件GaussDbReader根据指定的column、table、where条件拼接SQL并根据这个SQL进行数据抽取。在实际业务场景中往往会选择当天的数据进行同步可以将where条件指定为gmt_create > $bizdate 。注意不可以将where条件指定为limit 10limit不是SQL的合法where子句。<br />
where条件可以有效地进行业务增量同步。 where条件不配置或者为空视作全表同步数据。
* 必选:否 <br />
* 默认值:无 <br />
* **querySql**
* 描述在有些业务场景下where这一配置项不足以描述所筛选的条件用户可以通过该配置型来自定义筛选SQL。当用户配置了这一项之后DataX系统就会忽略tablecolumn这些配置型直接使用这个配置项的内容对数据进行筛选例如需要进行多表join后同步数据使用select a,b from table_a join table_b on table_a.id = table_b.id <br />
`当用户配置querySql时GaussDbReader直接忽略table、column、where条件的配置`
* 必选:否 <br />
* 默认值:无 <br />
* **fetchSize**
* 描述该配置项定义了插件和数据库服务器端每次批量数据获取条数该值决定了DataX和服务器端的网络交互次数能够较大的提升数据抽取性能。<br />
`注意,该值过大(>2048)可能造成DataX进程OOM。`
* 必选:否 <br />
* 默认值1024 <br />
### 3.3 类型转换
目前GaussDbReader支持大部分GaussDB类型但也存在部分个别类型没有支持的情况请注意检查你的类型。
下面列出GaussDbReader针对GaussDB类型转换列表:
| DataX 内部类型| GaussDB 数据类型 |
| -------- | ----- |
| Long |bigint, bigserial, integer, smallint, serial |
| Double |double precision, money, numeric, real |
| String |varchar, char, text, bit, inet|
| Date |date, time, timestamp |
| Boolean |bool|
| Bytes |bytea|
请注意:
* `除上述罗列字段类型外,其他类型均不支持; money,inet,bit需用户使用a_inet::varchar类似的语法转换`
## 4 性能报告
### 4.1 环境准备
#### 4.1.1 数据特征
建表语句:
create table pref_test(
id serial,
a_bigint bigint,
a_bit bit(10),
a_boolean boolean,
a_char character(5),
a_date date,
a_double double precision,
a_integer integer,
a_money money,
a_num numeric(10,2),
a_real real,
a_smallint smallint,
a_text text,
a_time time,
a_timestamp timestamp
)
#### 4.1.2 机器参数
* 执行DataX的机器参数为:
1. cpu: 16核 Intel(R) Xeon(R) CPU E5620 @ 2.40GHz
2. mem: MemTotal: 24676836kB MemFree: 6365080kB
3. net: 百兆双网卡
* GaussDB数据库机器参数为:
D12 24逻辑核 192G内存 12*480G SSD 阵列
### 4.2 测试报告
#### 4.2.1 单表测试报告
| 通道数 | 是否按照主键切分 | DataX速度(Rec/s) | DataX流量(MB/s) | DataX机器运行负载 |
|--------|--------| --------|--------|--------|
|1| 否 | 10211 | 0.63 | 0.2 |
|1| 是 | 10211 | 0.63 | 0.2 |
|4| 否 | 10211 | 0.63 | 0.2 |
|4| 是 | 40000 | 2.48 | 0.5 |
|8| 否 | 10211 | 0.63 | 0.2 |
|8| 是 | 78048 | 4.84 | 0.8 |
说明:
1. 这里的单表,主键类型为 serial数据分布均匀。
2. 对单表如果没有按照主键切分那么配置通道个数不会提升速度效果与1个通道一样。

86
gaussdbreader/pom.xml Normal file
View File

@ -0,0 +1,86 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>datax-all</artifactId>
<groupId>com.alibaba.datax</groupId>
<version>0.0.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>gaussdbreader</artifactId>
<name>gaussdbreader</name>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-common</artifactId>
<version>${datax-project-version}</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
</dependency>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>plugin-rdbms-util</artifactId>
<version>${datax-project-version}</version>
</dependency>
<dependency>
<groupId>org.opengauss</groupId>
<artifactId>opengauss-jdbc</artifactId>
<version>3.0.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- compiler plugin -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${jdk-version}</source>
<target>${jdk-version}</target>
<encoding>${project-sourceEncoding}</encoding>
</configuration>
</plugin>
<!-- assembly plugin -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptors>
<descriptor>src/main/assembly/package.xml</descriptor>
</descriptors>
<finalName>datax</finalName>
</configuration>
<executions>
<execution>
<id>dwzip</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,35 @@
<assembly
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
<id></id>
<formats>
<format>dir</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<fileSets>
<fileSet>
<directory>src/main/resources</directory>
<includes>
<include>plugin.json</include>
<include>plugin_job_template.json</include>
</includes>
<outputDirectory>plugin/reader/gaussdbreader</outputDirectory>
</fileSet>
<fileSet>
<directory>target/</directory>
<includes>
<include>gaussdbreader-0.0.1-SNAPSHOT.jar</include>
</includes>
<outputDirectory>plugin/reader/gaussdbreader</outputDirectory>
</fileSet>
</fileSets>
<dependencySets>
<dependencySet>
<useProjectArtifact>false</useProjectArtifact>
<outputDirectory>plugin/reader/gaussdbreader/libs</outputDirectory>
<scope>runtime</scope>
</dependencySet>
</dependencySets>
</assembly>

View File

@ -0,0 +1,7 @@
package com.alibaba.datax.plugin.reader.gaussdbreader;
public class Constant {
public static final int DEFAULT_FETCH_SIZE = 1000;
}

View File

@ -0,0 +1,86 @@
package com.alibaba.datax.plugin.reader.gaussdbreader;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordSender;
import com.alibaba.datax.common.spi.Reader;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.plugin.rdbms.reader.CommonRdbmsReader;
import com.alibaba.datax.plugin.rdbms.util.DBUtilErrorCode;
import com.alibaba.datax.plugin.rdbms.util.DataBaseType;
import java.util.List;
public class GaussDbReader extends Reader {
private static final DataBaseType DATABASE_TYPE = DataBaseType.GaussDB;
public static class Job extends Reader.Job {
private Configuration originalConfig;
private CommonRdbmsReader.Job commonRdbmsReaderMaster;
@Override
public void init() {
this.originalConfig = super.getPluginJobConf();
int fetchSize = this.originalConfig.getInt(com.alibaba.datax.plugin.rdbms.reader.Constant.FETCH_SIZE,
Constant.DEFAULT_FETCH_SIZE);
if (fetchSize < 1) {
throw DataXException.asDataXException(DBUtilErrorCode.REQUIRED_VALUE,
String.format("您配置的fetchSize有误根据DataX的设计fetchSize : [%d] 设置值不能小于 1.", fetchSize));
}
this.originalConfig.set(com.alibaba.datax.plugin.rdbms.reader.Constant.FETCH_SIZE, fetchSize);
this.commonRdbmsReaderMaster = new CommonRdbmsReader.Job(DATABASE_TYPE);
this.commonRdbmsReaderMaster.init(this.originalConfig);
}
@Override
public List<Configuration> split(int adviceNumber) {
return this.commonRdbmsReaderMaster.split(this.originalConfig, adviceNumber);
}
@Override
public void post() {
this.commonRdbmsReaderMaster.post(this.originalConfig);
}
@Override
public void destroy() {
this.commonRdbmsReaderMaster.destroy(this.originalConfig);
}
}
public static class Task extends Reader.Task {
private Configuration readerSliceConfig;
private CommonRdbmsReader.Task commonRdbmsReaderSlave;
@Override
public void init() {
this.readerSliceConfig = super.getPluginJobConf();
this.commonRdbmsReaderSlave = new CommonRdbmsReader.Task(DATABASE_TYPE,super.getTaskGroupId(), super.getTaskId());
this.commonRdbmsReaderSlave.init(this.readerSliceConfig);
}
@Override
public void startRead(RecordSender recordSender) {
int fetchSize = this.readerSliceConfig.getInt(com.alibaba.datax.plugin.rdbms.reader.Constant.FETCH_SIZE);
this.commonRdbmsReaderSlave.startRead(this.readerSliceConfig, recordSender,
super.getTaskPluginCollector(), fetchSize);
}
@Override
public void post() {
this.commonRdbmsReaderSlave.post(this.readerSliceConfig);
}
@Override
public void destroy() {
this.commonRdbmsReaderSlave.destroy(this.readerSliceConfig);
}
}
}

View File

@ -0,0 +1,6 @@
{
"name": "gaussdbreader",
"class": "com.alibaba.datax.plugin.reader.gaussdbreader.GaussDbReader",
"description": "useScene: prod. mechanism: Jdbc connection using the database, execute select sql, retrieve data from the ResultSet. warn: The more you know about the database, the less problems you encounter.",
"developer": "alibaba"
}

View File

@ -0,0 +1,13 @@
{
"name": "gaussdbreader",
"parameter": {
"username": "",
"password": "",
"connection": [
{
"table": [],
"jdbcUrl": []
}
]
}
}

View File

@ -0,0 +1,267 @@
# DataX GaussDbWriter
---
## 1 快速介绍
GaussDbWriter插件实现了写入数据到 GaussDB主库目的表的功能。在底层实现上GaussDbWriter通过JDBC连接远程 GaussDB 数据库,并执行相应的 insert into ... sql 语句将数据写入 GaussDB内部会分批次提交入库。
GaussDbWriter面向ETL开发工程师他们使用GaussDbWriter从数仓导入数据到GaussDB。同时 GaussDbWriter亦可以作为数据迁移工具为DBA等用户提供服务。
## 2 实现原理
GaussDbWriter通过 DataX 框架获取 Reader 生成的协议数据根据你配置生成相应的SQL插入语句
* `insert into...`(当主键/唯一性索引冲突时会写不进去冲突的行)
<br />
注意:
1. 目的表所在数据库必须是主库才能写入数据;整个任务至少需具备 insert into...的权限,是否需要其他权限,取决于你任务配置中在 preSql 和 postSql 中指定的语句。
2. GaussDbWriter和MysqlWriter不同不支持配置writeMode参数。
## 3 功能说明
### 3.1 配置样例
* 这里使用一份从内存产生到 GaussDbWriter导入的数据。
```json
{
"job": {
"setting": {
"speed": {
"channel": 1
}
},
"content": [
{
"reader": {
"name": "streamreader",
"parameter": {
"column" : [
{
"value": "DataX",
"type": "string"
},
{
"value": 19880808,
"type": "long"
},
{
"value": "1988-08-08 08:08:08",
"type": "date"
},
{
"value": true,
"type": "bool"
},
{
"value": "test",
"type": "bytes"
}
],
"sliceRecordCount": 1000
}
},
"writer": {
"name": "gaussdbwriter",
"parameter": {
"username": "xx",
"password": "xx",
"column": [
"id",
"name"
],
"preSql": [
"delete from test"
],
"connection": [
{
"jdbcUrl": "jdbc:opengauss://127.0.0.1:3002/datax",
"table": [
"test"
]
}
]
}
}
}
]
}
}
```
### 3.2 参数说明
* **jdbcUrl**
* 描述:目的数据库的 JDBC 连接信息 ,jdbcUrl必须包含在connection配置单元中。
注意1、在一个数据库上只能配置一个值。
2、jdbcUrl按照GaussDB官方规范并可以填写连接附加参数信息。具体请参看GaussDB官方文档或者咨询对应 DBA。
* 必选:是 <br />
* 默认值:无 <br />
* **username**
* 描述:目的数据库的用户名 <br />
* 必选:是 <br />
* 默认值:无 <br />
* **password**
* 描述:目的数据库的密码 <br />
* 必选:是 <br />
* 默认值:无 <br />
* **table**
* 描述:目的表的表名称。支持写入一个或者多个表。当配置为多张表时,必须确保所有表结构保持一致。
注意table 和 jdbcUrl 必须包含在 connection 配置单元中
* 必选:是 <br />
* 默认值:无 <br />
* **column**
* 描述:目的表需要写入数据的字段,字段之间用英文逗号分隔。例如: "column": ["id","name","age"]。如果要依次写入全部列,使用\*表示, 例如: "column": ["\*"]
注意1、我们强烈不推荐你这样配置因为当你目的表字段个数、类型等有改动时你的任务可能运行不正确或者失败
2、此处 column 不能配置任何常量值
* 必选:是 <br />
* 默认值:否 <br />
* **preSql**
* 描述:写入数据到目的表前,会先执行这里的标准语句。如果 Sql 中有你需要操作到的表名称,请使用 `@table` 表示,这样在实际执行 Sql 语句时会对变量按照实际表名称进行替换。比如你的任务是要写入到目的端的100个同构分表(表名称为:datax_00,datax01, ... datax_98,datax_99),并且你希望导入数据前,先对表中数据进行删除操作,那么你可以这样配置:`"preSql":["delete from @table"]`,效果是:在执行到每个表写入数据前,会先执行对应的 delete from 对应表名称 <br />
* 必选:否 <br />
* 默认值:无 <br />
* **postSql**
* 描述:写入数据到目的表后,会执行这里的标准语句。(原理同 preSql <br />
* 必选:否 <br />
* 默认值:无 <br />
* **batchSize**
* 描述一次性批量提交的记录数大小该值可以极大减少DataX与GaussDB的网络交互次数并提升整体吞吐量。但是该值设置过大可能会造成DataX运行进程OOM情况。<br />
* 必选:否 <br />
* 默认值1024 <br />
### 3.3 类型转换
目前 GaussDbWriter支持大部分 GaussDB类型但也存在部分没有支持的情况请注意检查你的类型。
下面列出 GaussDbWriter针对 GaussDB类型转换列表:
| DataX 内部类型| GaussDB 数据类型 |
| -------- | ----- |
| Long |bigint, bigserial, integer, smallint, serial |
| Double |double precision, money, numeric, real |
| String |varchar, char, text, bit|
| Date |date, time, timestamp |
| Boolean |bool|
| Bytes |bytea|
## 4 性能报告
### 4.1 环境准备
#### 4.1.1 数据特征
建表语句:
create table pref_test(
id serial,
a_bigint bigint,
a_bit bit(10),
a_boolean boolean,
a_char character(5),
a_date date,
a_double double precision,
a_integer integer,
a_money money,
a_num numeric(10,2),
a_real real,
a_smallint smallint,
a_text text,
a_time time,
a_timestamp timestamp
)
#### 4.1.2 机器参数
* 执行DataX的机器参数为:
1. cpu: 16核 Intel(R) Xeon(R) CPU E5620 @ 2.40GHz
2. mem: MemTotal: 24676836kB MemFree: 6365080kB
3. net: 百兆双网卡
* GaussDB数据库机器参数为:
D12 24逻辑核 192G内存 12*480G SSD 阵列
### 4.2 测试报告
#### 4.2.1 单表测试报告
| 通道数| 批量提交batchSize | DataX速度(Rec/s)| DataX流量(M/s) | DataX机器运行负载
|--------|--------| --------|--------|--------|--------|
|1| 128 | 9259 | 0.55 | 0.3
|1| 512 | 10869 | 0.653 | 0.3
|1| 2048 | 9803 | 0.589 | 0.8
|4| 128 | 30303 | 1.82 | 1
|4| 512 | 36363 | 2.18 | 1
|4| 2048 | 36363 | 2.18 | 1
|8| 128 | 57142 | 3.43 | 2
|8| 512 | 66666 | 4.01 | 1.5
|8| 2048 | 66666 | 4.01 | 1.1
|16| 128 | 88888 | 5.34 | 1.8
|16| 2048 | 94117 | 5.65 | 2.5
|32| 512 | 76190 | 4.58 | 3
#### 4.2.2 性能测试小结
1. `channel数对性能影响很大`
2. `通常不建议写入数据库时,通道个数 > 32`
## FAQ
***
**Q: GaussDbWriter 执行 postSql 语句报错,那么数据导入到目标数据库了吗?**
A: DataX 导入过程存在三块逻辑pre 操作、导入操作、post 操作其中任意一环报错DataX 作业报错。由于 DataX 不能保证在同一个事务完成上述几个操作,因此有可能数据已经落入到目标端。
***
**Q: 按照上述说法,那么有部分脏数据导入数据库,如果影响到线上数据库怎么办?**
A: 目前有两种解法,第一种配置 pre 语句,该 sql 可以清理当天导入数据, DataX 每次导入时候可以把上次清理干净并导入完整数据。
第二种,向临时表导入数据,完成后再 rename 到线上表。
***

86
gaussdbwriter/pom.xml Normal file
View File

@ -0,0 +1,86 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>datax-all</artifactId>
<groupId>com.alibaba.datax</groupId>
<version>0.0.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>gaussdbwriter</artifactId>
<name>gaussdbwriter</name>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-common</artifactId>
<version>${datax-project-version}</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
</dependency>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>plugin-rdbms-util</artifactId>
<version>${datax-project-version}</version>
</dependency>
<dependency>
<groupId>org.opengauss</groupId>
<artifactId>opengauss-jdbc</artifactId>
<version>3.0.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- compiler plugin -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${jdk-version}</source>
<target>${jdk-version}</target>
<encoding>${project-sourceEncoding}</encoding>
</configuration>
</plugin>
<!-- assembly plugin -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptors>
<descriptor>src/main/assembly/package.xml</descriptor>
</descriptors>
<finalName>datax</finalName>
</configuration>
<executions>
<execution>
<id>dwzip</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,35 @@
<assembly
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
<id></id>
<formats>
<format>dir</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<fileSets>
<fileSet>
<directory>src/main/resources</directory>
<includes>
<include>plugin.json</include>
<include>plugin_job_template.json</include>
</includes>
<outputDirectory>plugin/writer/gaussdbwriter</outputDirectory>
</fileSet>
<fileSet>
<directory>target/</directory>
<includes>
<include>gaussdbwriter-0.0.1-SNAPSHOT.jar</include>
</includes>
<outputDirectory>plugin/writer/gaussdbwriter</outputDirectory>
</fileSet>
</fileSets>
<dependencySets>
<dependencySet>
<useProjectArtifact>false</useProjectArtifact>
<outputDirectory>plugin/writer/gaussdbwriter/libs</outputDirectory>
<scope>runtime</scope>
</dependencySet>
</dependencySets>
</assembly>

View File

@ -0,0 +1,103 @@
package com.alibaba.datax.plugin.reader.gaussdbwriter;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordReceiver;
import com.alibaba.datax.common.spi.Writer;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.plugin.rdbms.util.DBUtilErrorCode;
import com.alibaba.datax.plugin.rdbms.util.DataBaseType;
import com.alibaba.datax.plugin.rdbms.writer.CommonRdbmsWriter;
import com.alibaba.datax.plugin.rdbms.writer.Key;
import java.util.List;
public class GaussDbWriter extends Writer {
private static final DataBaseType DATABASE_TYPE = DataBaseType.GaussDB;
public static class Job extends Writer.Job {
private Configuration originalConfig = null;
private CommonRdbmsWriter.Job commonRdbmsWriterMaster;
@Override
public void init() {
this.originalConfig = super.getPluginJobConf();
// warnnot like mysql, GaussDB only support insert mode, don't use
String writeMode = this.originalConfig.getString(Key.WRITE_MODE);
if (null != writeMode) {
throw DataXException.asDataXException(DBUtilErrorCode.CONF_ERROR,
String.format("写入模式(writeMode)配置有误. 因为GaussDB不支持配置参数项 writeMode: %s, GaussDB仅使用insert sql 插入数据. 请检查您的配置并作出修改.", writeMode));
}
this.commonRdbmsWriterMaster = new CommonRdbmsWriter.Job(DATABASE_TYPE);
this.commonRdbmsWriterMaster.init(this.originalConfig);
}
@Override
public void prepare() {
this.commonRdbmsWriterMaster.prepare(this.originalConfig);
}
@Override
public List<Configuration> split(int mandatoryNumber) {
return this.commonRdbmsWriterMaster.split(this.originalConfig, mandatoryNumber);
}
@Override
public void post() {
this.commonRdbmsWriterMaster.post(this.originalConfig);
}
@Override
public void destroy() {
this.commonRdbmsWriterMaster.destroy(this.originalConfig);
}
}
public static class Task extends Writer.Task {
private Configuration writerSliceConfig;
private CommonRdbmsWriter.Task commonRdbmsWriterSlave;
@Override
public void init() {
this.writerSliceConfig = super.getPluginJobConf();
this.commonRdbmsWriterSlave = new CommonRdbmsWriter.Task(DATABASE_TYPE){
@Override
public String calcValueHolder(String columnType){
if("serial".equalsIgnoreCase(columnType)){
return "?::int";
}else if("bigserial".equalsIgnoreCase(columnType)){
return "?::int8";
}else if("bit".equalsIgnoreCase(columnType)){
return "?::bit varying";
}
return "?::" + columnType;
}
};
this.commonRdbmsWriterSlave.init(this.writerSliceConfig);
}
@Override
public void prepare() {
this.commonRdbmsWriterSlave.prepare(this.writerSliceConfig);
}
public void startWrite(RecordReceiver recordReceiver) {
this.commonRdbmsWriterSlave.startWrite(recordReceiver, this.writerSliceConfig, super.getTaskPluginCollector());
}
@Override
public void post() {
this.commonRdbmsWriterSlave.post(this.writerSliceConfig);
}
@Override
public void destroy() {
this.commonRdbmsWriterSlave.destroy(this.writerSliceConfig);
}
}
}

View File

@ -0,0 +1,6 @@
{
"name": "gaussdbwriter",
"class": "com.alibaba.datax.plugin.writer.gaussdbwriter.GaussDbWriter",
"description": "useScene: prod. mechanism: Jdbc connection using the database, execute insert sql. warn: The more you know about the database, the less problems you encounter.",
"developer": "alibaba"
}

View File

@ -0,0 +1,16 @@
{
"name": "gaussdbwriter",
"parameter": {
"username": "",
"password": "",
"column": [],
"connection": [
{
"jdbcUrl": "",
"table": []
}
],
"preSql": [],
"postSql": []
}
}

View File

@ -60,12 +60,16 @@ hbase11xsqlreader插件实现了从Phoenix(HBase SQL)读取数据。在底层实
//填写连接Phoenix的hbase集群zk地址
"hbaseConfig": {
"hbase.zookeeper.quorum": "hb-proxy-xxx-002.hbase.rds.aliyuncs.com,hb-proxy-xxx-001.hbase.rds.aliyuncs.com,hb-proxy-xxx-003.hbase.rds.aliyuncs.com"
},
},
//填写要读取的phoenix的命名空间
"schema": "TAG",
//填写要读取的phoenix的表名
"table": "US_POPULATION",
//填写要读取的列名,不填读取所有列
"column": [
]
],
//查询条件
"where": "id="
}
},
"writer": {
@ -92,11 +96,18 @@ hbase11xsqlreader插件实现了从Phoenix(HBase SQL)读取数据。在底层实
* 必选:是 <br />
* 默认值:无 <br />
* **schema**
* 描述编写Phoenix中的namespace该值设置为''
* 必选:是 <br />
* 默认值:无 <br />
* **table**
* 描述编写Phoenix中的表名,如果有namespace该值设置为'namespace.tablename'
* 描述编写Phoenix中的表名该值设置为'tablename'
* 必选:是 <br />
@ -109,7 +120,13 @@ hbase11xsqlreader插件实现了从Phoenix(HBase SQL)读取数据。在底层实
* 必选:是 <br />
* 默认值:无 <br />
* **where**
* 描述填写需要从phoenix表中读取条件判断。
* 可选:是 <br />
* 默认值:无 <br />
### 3.3 类型转换
@ -172,11 +189,14 @@ hbase11xsqlreader插件实现了从Phoenix(HBase SQL)读取数据。在底层实
"hbaseConfig": {
"hbase.zookeeper.quorum": "hb-proxy-xxx-002.hbase.rds.aliyuncs.com,hb-proxy-xxx-001.hbase.rds.aliyuncs.com,hb-proxy-xxx-003.hbase.rds.aliyuncs.com"
},
"schema": "TAG",
//填写要读取的phoenix的表名
"table": "US_POPULATION",
//填写要读取的列名,不填读取所有列
"column": [
]
],
//查询条件
"where": "id="
}
},
"writer": {
@ -204,7 +224,13 @@ hbase11xsqlreader插件实现了从Phoenix(HBase SQL)读取数据。在底层实
* 必选:是 <br />
* 默认值:无 <br />
* **schema**
* 描述编写Phoenix中的namespace该值设置为''
* 必选:是 <br />
* 默认值:无 <br />
* **table**
* 描述编写Phoenix中的表名,如果有namespace该值设置为'namespace.tablename'
@ -220,7 +246,13 @@ hbase11xsqlreader插件实现了从Phoenix(HBase SQL)读取数据。在底层实
* 必选:是 <br />
* 默认值:无 <br />
* **where**
* 描述填写需要从phoenix表中读取条件判断。
* 可选:是 <br />
* 默认值:无 <br />
### 3.3 类型转换

View File

@ -26,9 +26,7 @@ import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.*;
public class HbaseSQLHelper {
@ -50,11 +48,15 @@ public class HbaseSQLHelper {
String zkUrl = readerConfig.getZkUrl();
PhoenixConfigurationUtil.setInputClass(conf, PhoenixRecordWritable.class);
PhoenixConfigurationUtil.setInputTableName(conf, table);
PhoenixConfigurationUtil.setInputTableName(conf, readerConfig.getSchema()+"."+table);
if (!columns.isEmpty()) {
PhoenixConfigurationUtil.setSelectColumnNames(conf, columns.toArray(new String[columns.size()]));
}
if(Objects.nonNull(readerConfig.getWhere())){
PhoenixConfigurationUtil.setInputTableConditions(conf,readerConfig.getWhere());
}
PhoenixEmbeddedDriver.ConnectionInfo info = null;
try {
info = PhoenixEmbeddedDriver.ConnectionInfo.create(zkUrl);
@ -67,15 +69,19 @@ public class HbaseSQLHelper {
conf.setInt(HConstants.ZOOKEEPER_CLIENT_PORT, info.getPort());
if (info.getRootNode() != null)
conf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, info.getRootNode());
conf.set(Key.NAME_SPACE_MAPPING_ENABLED,"true");
conf.set(Key.SYSTEM_TABLES_TO_NAMESPACE,"true");
return conf;
}
public static List<String> getPColumnNames(String connectionString, String tableName) throws SQLException {
Connection con =
DriverManager.getConnection(connectionString);
public static List<String> getPColumnNames(String connectionString, String tableName,String schema) throws SQLException {
Properties pro = new Properties();
pro.put(Key.NAME_SPACE_MAPPING_ENABLED, true);
pro.put(Key.SYSTEM_TABLES_TO_NAMESPACE, true);
Connection con = DriverManager.getConnection(connectionString,pro);
PhoenixConnection phoenixConnection = con.unwrap(PhoenixConnection.class);
MetaDataClient metaDataClient = new MetaDataClient(phoenixConnection);
PTable table = metaDataClient.updateCache("", tableName).getTable();
PTable table = metaDataClient.updateCache(schema, tableName).getTable();
List<String> columnNames = new ArrayList<String>();
for (PColumn pColumn : table.getColumns()) {
if (!pColumn.getName().getString().equals(SaltingUtil.SALTING_COLUMN_NAME))

View File

@ -9,6 +9,7 @@ import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.List;
import java.util.StringJoiner;
public class HbaseSQLReaderConfig {
private final static Logger LOG = LoggerFactory.getLogger(HbaseSQLReaderConfig.class);
@ -27,6 +28,9 @@ public class HbaseSQLReaderConfig {
private String tableName;
private List<String> columns; // 目的表的所有列的列名包括主键和非主键不包括时间列
private String where;//条件
private String schema;//
/**
* @return 获取原始的datax配置
*/
@ -96,22 +100,27 @@ public class HbaseSQLReaderConfig {
}
String zkQuorum = zkCfg.getFirst();
String znode = zkCfg.getSecond();
if (zkQuorum == null || zkQuorum.isEmpty()) {
throw DataXException.asDataXException(
HbaseSQLReaderErrorCode.ILLEGAL_VALUE, "HBase的hbase.zookeeper.quorum配置不能为空" );
}
// 生成sql使用的连接字符串 格式 jdbc:hbase:zk_quorum:2181:/znode_parent
cfg.connectionString = "jdbc:phoenix:" + zkQuorum;
cfg.zkUrl = zkQuorum + ":2181";
StringBuilder connectionString=new StringBuilder("jdbc:phoenix:");
connectionString.append(zkQuorum);
cfg.connectionString = connectionString.toString();
StringBuilder zkUrl =new StringBuilder(zkQuorum);
cfg.zkUrl = zkUrl.append(":2181").toString();
if (!znode.isEmpty()) {
cfg.connectionString += cfg.connectionString + ":" + znode;
cfg.zkUrl += cfg.zkUrl + ":" + znode;
cfg.connectionString = connectionString.append(":").append(znode).toString();
cfg.zkUrl=zkUrl.append(":").append(znode).toString();
}
}
private static void parseTableConfig(HbaseSQLReaderConfig cfg, Configuration dataxCfg) {
// 解析并检查表名
cfg.tableName = dataxCfg.getString(Key.TABLE);
cfg.schema = dataxCfg.getString(Key.SCHEMA);
if (cfg.tableName == null || cfg.tableName.isEmpty()) {
throw DataXException.asDataXException(
HbaseSQLReaderErrorCode.ILLEGAL_VALUE, "HBase的tableName配置不能为空,请检查并修改配置." );
@ -124,13 +133,14 @@ public class HbaseSQLReaderConfig {
HbaseSQLReaderErrorCode.ILLEGAL_VALUE, "您配置的tableName含有非法字符{0},请检查您的配置.");
} else if (cfg.columns.isEmpty()) {
try {
cfg.columns = HbaseSQLHelper.getPColumnNames(cfg.connectionString, cfg.tableName);
cfg.columns = HbaseSQLHelper.getPColumnNames(cfg.connectionString, cfg.tableName,cfg.schema);
dataxCfg.set(Key.COLUMN, cfg.columns);
} catch (SQLException e) {
throw DataXException.asDataXException(
HbaseSQLReaderErrorCode.GET_PHOENIX_COLUMN_ERROR, "HBase的columns配置不能为空,请添加目标表的列名配置." + e.getMessage(), e);
}
}
cfg.where=dataxCfg.getString(Key.WHERE);
}
@Override
@ -151,6 +161,8 @@ public class HbaseSQLReaderConfig {
ret.append(",");
}
ret.setLength(ret.length() - 1);
ret.append("[where=]").append(getWhere());
ret.append("[schema=]").append(getSchema());
ret.append("\n");
return ret.toString();
@ -161,4 +173,20 @@ public class HbaseSQLReaderConfig {
*/
private HbaseSQLReaderConfig() {
}
public String getWhere() {
return where;
}
public void setWhere(String where) {
this.where = where;
}
public String getSchema() {
return schema;
}
public void setSchema(String schema) {
this.schema = schema;
}
}

View File

@ -19,10 +19,8 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.math.BigDecimal;
import java.sql.*;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.sql.Date;
import java.util.*;
/**
* Created by admin on 1/3/18.
@ -42,11 +40,14 @@ public class HbaseSQLReaderTask {
}
private void getPColumns() throws SQLException {
Properties pro = new Properties();
pro.put(Key.NAME_SPACE_MAPPING_ENABLED, true);
pro.put(Key.SYSTEM_TABLES_TO_NAMESPACE, true);
Connection con =
DriverManager.getConnection(this.readerConfig.getConnectionString());
DriverManager.getConnection(this.readerConfig.getConnectionString(),pro);
PhoenixConnection phoenixConnection = con.unwrap(PhoenixConnection.class);
MetaDataClient metaDataClient = new MetaDataClient(phoenixConnection);
PTable table = metaDataClient.updateCache("", this.readerConfig.getTableName()).getTable();
PTable table = metaDataClient.updateCache(this.readerConfig.getSchema(), this.readerConfig.getTableName()).getTable();
List<String> columnNames = this.readerConfig.getColumns();
for (PColumn pColumn : table.getColumns()) {
if (columnNames.contains(pColumn.getName().getString())) {

View File

@ -24,5 +24,18 @@ public final class Key {
* 必选列配置
*/
public final static String COLUMN = "column";
/**
*
*/
public static final String WHERE = "where";
/**
* 可选Phoenix表所属schema默认为空
*/
public static final String SCHEMA = "schema";
public static final String NAME_SPACE_MAPPING_ENABLED = "phoenix.schema.isNamespaceMappingEnabled";
public static final String SYSTEM_TABLES_TO_NAMESPACE = "phoenix.schema.mapSystemTablesToNamespace";
}

View File

@ -1,5 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>datax-all</artifactId>
<groupId>com.alibaba.datax</groupId>
@ -111,6 +112,42 @@
<version>${datax-project-version}</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-column</artifactId>
<version>1.12.0</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
<version>1.12.0</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-common</artifactId>
<version>1.12.0</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-format</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-jackson</artifactId>
<version>1.12.0</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-encoding</artifactId>
<version>1.12.0</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>1.12.0</version>
</dependency>
</dependencies>
<build>

View File

@ -37,6 +37,28 @@
<!--</includes>-->
<!--<outputDirectory>plugin/reader/hdfsreader/libs</outputDirectory>-->
<!--</fileSet>-->
<!--<fileSet>-->
<!--<directory>src/main/libs</directory>-->
<!--<includes>-->
<!--<include>*.*</include>-->
<!--</includes>-->
<!--<outputDirectory>plugin/reader/hdfsreader/libs</outputDirectory>-->
<!--</fileSet>-->
<fileSet>
<directory>src/main/libs</directory>
<includes>
<include>*.*</include>
</includes>
<outputDirectory>plugin/reader/ossreader/libs</outputDirectory>
</fileSet>
<fileSet>
<directory>src/main/libs</directory>
<includes>
<include>*.*</include>
</includes>
<outputDirectory>plugin/reader/hivereader/libs</outputDirectory>
</fileSet>
</fileSets>
<dependencySets>

View File

@ -10,4 +10,5 @@ public class Constant {
public static final String CSV = "CSV";
public static final String SEQ = "SEQ";
public static final String RC = "RC";
public static final String PARQUET = "PARQUET";
}

View File

@ -9,12 +9,16 @@ import com.alibaba.datax.plugin.unstructuredstorage.reader.ColumnEntry;
import com.alibaba.datax.plugin.unstructuredstorage.reader.UnstructuredStorageReaderErrorCode;
import com.alibaba.datax.plugin.unstructuredstorage.reader.UnstructuredStorageReaderUtil;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import org.apache.commons.lang3.BooleanUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.ql.io.RCFileRecordReader;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
@ -29,14 +33,30 @@ import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.PrimitiveType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
/**
* Created by mingya.wmy on 2015/8/12.
@ -56,6 +76,10 @@ public class DFSUtil {
public static final String HDFS_DEFAULTFS_KEY = "fs.defaultFS";
public static final String HADOOP_SECURITY_AUTHENTICATION_KEY = "hadoop.security.authentication";
private Boolean skipEmptyOrcFile = false;
private Integer orcFileEmptySize = null;
public DFSUtil(Configuration taskConfig) {
hadoopConf = new org.apache.hadoop.conf.Configuration();
@ -79,6 +103,7 @@ public class DFSUtil {
this.hadoopConf.set(HADOOP_SECURITY_AUTHENTICATION_KEY, "kerberos");
}
this.kerberosAuthentication(this.kerberosPrincipal, this.kerberosKeytabFilePath);
this.skipEmptyOrcFile = taskConfig.getBool(Key.SKIP_EMPTY_ORCFILE, false);
LOG.info(String.format("hadoopConfig details:%s", JSON.toJSONString(this.hadoopConf)));
}
@ -102,10 +127,11 @@ public class DFSUtil {
* @param srcPaths 路径列表
* @param specifiedFileType 指定文件类型
*/
public HashSet<String> getAllFiles(List<String> srcPaths, String specifiedFileType) {
public HashSet<String> getAllFiles(List<String> srcPaths, String specifiedFileType, Boolean skipEmptyOrcFile, Integer orcFileEmptySize) {
this.specifiedFileType = specifiedFileType;
this.skipEmptyOrcFile = skipEmptyOrcFile;
this.orcFileEmptySize = orcFileEmptySize;
if (!srcPaths.isEmpty()) {
for (String eachPath : srcPaths) {
LOG.info(String.format("get HDFS all files in path = [%s]", eachPath));
@ -127,9 +153,13 @@ public class DFSUtil {
FileStatus stats[] = hdfs.globStatus(path);
for (FileStatus f : stats) {
if (f.isFile()) {
if (f.getLen() == 0) {
long fileLength = f.getLen();
if (fileLength == 0) {
String message = String.format("文件[%s]长度为0将会跳过不作处理", hdfsPath);
LOG.warn(message);
} else if (BooleanUtils.isTrue(this.skipEmptyOrcFile) && this.orcFileEmptySize != null && fileLength <= this.orcFileEmptySize) {
String message = String.format("The orc file [%s] is empty, file size: %s, DataX will skip it !", f.getPath().toString(), fileLength);
LOG.warn(message);
} else {
addSourceFileByType(f.getPath().toString());
}
@ -167,7 +197,16 @@ public class DFSUtil {
LOG.info(String.format("[%s] 是目录, 递归获取该目录下的文件", f.getPath().toString()));
getHDFSAllFilesNORegex(f.getPath().toString(), hdfs);
} else if (f.isFile()) {
long fileLength = f.getLen();
if (fileLength == 0) {
String message = String.format("The file [%s] is empty, DataX will skip it !", f.getPath().toString());
LOG.warn(message);
continue;
} else if (BooleanUtils.isTrue(this.skipEmptyOrcFile) && this.orcFileEmptySize != null && fileLength <= this.orcFileEmptySize) {
String message = String.format("The orc file [%s] is empty, file size: %s, DataX will skip it !", f.getPath().toString(), fileLength);
LOG.warn(message);
continue;
}
addSourceFileByType(f.getPath().toString());
} else {
String message = String.format("该路径[%s]文件类型既不是目录也不是文件,插件自动忽略。",
@ -332,7 +371,19 @@ public class DFSUtil {
//Each file as a split
//TODO multy threads
// OrcInputFormat getSplits params numSplits not used, splits size = block numbers
InputSplit[] splits = in.getSplits(conf, -1);
InputSplit[] splits;
try {
splits = in.getSplits(conf, 1);
} catch (Exception splitException) {
if (Boolean.TRUE.equals(this.skipEmptyOrcFile)) {
boolean isOrcFileEmptyException = checkIsOrcEmptyFileExecption(splitException);
if (isOrcFileEmptyException) {
LOG.info("skipEmptyOrcFile: true, \"{}\" is an empty orc file, skip it!", sourceOrcFilePath);
return;
}
}
throw splitException;
}
for (InputSplit split : splits) {
{
RecordReader reader = in.getRecordReader(split, conf, Reporter.NULL);
@ -349,8 +400,11 @@ public class DFSUtil {
Object field = inspector.getStructFieldData(value, fields.get(i));
recordFields.add(field);
}
List<ColumnEntry> hivePartitionColumnEntrys = UnstructuredStorageReaderUtil.getListColumnEntry(readerSliceConfig, com.alibaba.datax.plugin.unstructuredstorage.reader.Key.HIVE_PARTION_COLUMN);
ArrayList<Column> hivePartitionColumns = new ArrayList<>();
hivePartitionColumns = UnstructuredStorageReaderUtil.getHivePartitionColumns(sourceOrcFilePath, hivePartitionColumnEntrys);
transportOneRecord(column, recordFields, recordSender,
taskPluginCollector, isReadAllColumns, nullFormat);
taskPluginCollector, isReadAllColumns, nullFormat,hivePartitionColumns);
}
reader.close();
}
@ -367,8 +421,20 @@ public class DFSUtil {
}
}
private boolean checkIsOrcEmptyFileExecption(Exception e) {
if (e == null) {
return false;
}
String fullStackTrace = ExceptionUtils.getStackTrace(e);
if (fullStackTrace.contains("org.apache.orc.impl.ReaderImpl.getRawDataSizeOfColumn") && fullStackTrace.contains("Caused by: java.lang.IndexOutOfBoundsException: Index: 1, Size: 1")) {
return true;
}
return false;
}
private Record transportOneRecord(List<ColumnEntry> columnConfigs, List<Object> recordFields
, RecordSender recordSender, TaskPluginCollector taskPluginCollector, boolean isReadAllColumns, String nullFormat) {
, RecordSender recordSender, TaskPluginCollector taskPluginCollector, boolean isReadAllColumns, String nullFormat, ArrayList<Column> hiveParitionColumns) {
Record record = recordSender.createRecord();
Column columnGenerated;
try {
@ -555,8 +621,9 @@ public class DFSUtil {
} else if (StringUtils.equalsIgnoreCase(specifiedFileType, Constant.SEQ)) {
return isSequenceFile(filepath, in);
} else if (StringUtils.equalsIgnoreCase(specifiedFileType, Constant.PARQUET)) {
return true;
}
} catch (Exception e) {
String message = String.format("检查文件[%s]类型失败目前支持ORC,SEQUENCE,RCFile,TEXT,CSV五种格式的文件," +
"请检查您文件类型和文件是否正确。", filepath);
@ -693,4 +760,332 @@ public class DFSUtil {
return false;
}
public void parquetFileStartRead(String sourceParquetFilePath, Configuration readerSliceConfig, RecordSender recordSender, TaskPluginCollector taskPluginCollector) {
String schemaString = readerSliceConfig.getString(Key.PARQUET_SCHEMA);
if (StringUtils.isNotBlank(schemaString)) {
LOG.info("You config parquet schema, use it {}", schemaString);
} else {
schemaString = getParquetSchema(sourceParquetFilePath, hadoopConf);
LOG.info("Parquet schema parsed from: {} , schema is {}", sourceParquetFilePath, schemaString);
if (StringUtils.isBlank(schemaString)) {
throw DataXException.asDataXException("ParquetSchema is required, please check your config");
}
}
MessageType parquetSchema = null;
List<org.apache.parquet.schema.Type> parquetTypes = null;
Map<String, ParquetMeta> parquetMetaMap = null;
int fieldCount = 0;
try {
parquetSchema = MessageTypeParser.parseMessageType(schemaString);
fieldCount = parquetSchema.getFieldCount();
parquetTypes = parquetSchema.getFields();
parquetMetaMap = ParquetMessageHelper.parseParquetTypes(parquetTypes);
} catch (Exception e) {
String message = String.format("Error parsing to MessageType via Schema string [%s]", schemaString);
LOG.error(message);
throw DataXException.asDataXException(HdfsReaderErrorCode.PARSE_MESSAGE_TYPE_FROM_SCHEMA_ERROR, e);
}
List<ColumnEntry> column = UnstructuredStorageReaderUtil.getListColumnEntry(readerSliceConfig, com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN);
String nullFormat = readerSliceConfig.getString(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.NULL_FORMAT);
boolean isUtcTimestamp = readerSliceConfig.getBool(Key.PARQUET_UTC_TIMESTAMP, false);
boolean isReadAllColumns = (column == null || column.size() == 0) ? true : false;
LOG.info("ReadingAllColums: " + isReadAllColumns);
/**
* 支持 hive 表中间加列场景
*
* 开关默认 false hive表存在中间加列的场景打开需要根据 name排序
* 不默认打开的原因
* 1存量hdfs任务只根据 index获取字段无name字段配置
* 2中间加列场景比较少
* 3存量任务可能存在列错位的问题不能随意纠正
*/
boolean supportAddMiddleColumn = readerSliceConfig.getBool(Key.SUPPORT_ADD_MIDDLE_COLUMN, false);
boolean printNullValueException = readerSliceConfig.getBool("printNullValueException", false);
List<Integer> ignoreIndex = readerSliceConfig.getList("ignoreIndex", new ArrayList<Integer>(), Integer.class);
JobConf conf = new JobConf(hadoopConf);
ParquetReader<Group> reader = null;
try {
Path parquetFilePath = new Path(sourceParquetFilePath);
GroupReadSupport readSupport = new GroupReadSupport();
readSupport.init(conf, null, parquetSchema);
// 这里初始化parquetReader的时候会getFileSystem如果是HA集群期间会根据hadoopConfig中区加载failover类这里初始化builder带上conf
ParquetReader.Builder parquetReaderBuilder = ParquetReader.builder(readSupport, parquetFilePath);
parquetReaderBuilder.withConf(hadoopConf);
reader = parquetReaderBuilder.build();
Group g = null;
// 从文件名中解析分区信息
List<ColumnEntry> hivePartitionColumnEntrys = UnstructuredStorageReaderUtil.getListColumnEntry(readerSliceConfig, com.alibaba.datax.plugin.unstructuredstorage.reader.Key.HIVE_PARTION_COLUMN);
ArrayList<Column> hivePartitionColumns = new ArrayList<>();
hivePartitionColumns = UnstructuredStorageReaderUtil.getHivePartitionColumns(sourceParquetFilePath, hivePartitionColumnEntrys);
List<String> schemaFieldList = null;
Map<Integer, String> colNameIndexMap = null;
Map<Integer, Integer> indexMap = null;
if (supportAddMiddleColumn) {
boolean nonName = column.stream().anyMatch(columnEntry -> StringUtils.isEmpty(columnEntry.getName()));
if (nonName) {
throw new DataXException("You configured column item without name, please correct it");
}
List<org.apache.parquet.schema.Type> parquetFileFields = getParquetFileFields(parquetFilePath, hadoopConf);
schemaFieldList = parquetFileFields.stream().map(org.apache.parquet.schema.Type::getName).collect(Collectors.toList());
colNameIndexMap = new ConcurrentHashMap<>();
Map<Integer, String> finalColNameIndexMap = colNameIndexMap;
column.forEach(columnEntry -> finalColNameIndexMap.put(columnEntry.getIndex(), columnEntry.getName()));
Iterator<Map.Entry<Integer, String>> iterator = finalColNameIndexMap.entrySet().iterator();
while (iterator.hasNext()) {
Map.Entry<Integer, String> next = iterator.next();
if (!schemaFieldList.contains(next.getValue())) {
finalColNameIndexMap.remove((next.getKey()));
}
}
LOG.info("SupportAddMiddleColumn is true, fields from parquet file is {}, " +
"colNameIndexMap is {}", JSON.toJSONString(schemaFieldList), JSON.toJSONString(colNameIndexMap));
fieldCount = column.size();
indexMap = new HashMap<>();
for (int j = 0; j < fieldCount; j++) {
if (colNameIndexMap.containsKey(j)) {
int index = findIndex(schemaFieldList, findEleInMap(colNameIndexMap, j));
indexMap.put(j, index);
}
}
}
while ((g = reader.read()) != null) {
List<Object> formattedRecord = new ArrayList<Object>(fieldCount);
try {
for (int j = 0; j < fieldCount; j++) {
Object data = null;
try {
if (null != ignoreIndex && !ignoreIndex.isEmpty() && ignoreIndex.contains(j)) {
data = null;
} else {
if (supportAddMiddleColumn) {
if (!colNameIndexMap.containsKey(j)) {
formattedRecord.add(null);
continue;
} else {
data = DFSUtil.this.readFields(g, parquetTypes.get(indexMap.get(j)), indexMap.get(j), parquetMetaMap, isUtcTimestamp);
}
} else {
data = DFSUtil.this.readFields(g, parquetTypes.get(j), j, parquetMetaMap, isUtcTimestamp);
}
}
} catch (RuntimeException e) {
if (printNullValueException) {
LOG.warn(e.getMessage());
}
}
formattedRecord.add(data);
}
transportOneRecord(column, formattedRecord, recordSender, taskPluginCollector, isReadAllColumns, nullFormat, hivePartitionColumns);
} catch (Exception e) {
throw DataXException.asDataXException(HdfsReaderErrorCode.READ_PARQUET_ERROR, e);
}
}
} catch (Exception e) {
throw DataXException.asDataXException(HdfsReaderErrorCode.READ_PARQUET_ERROR, e);
} finally {
org.apache.commons.io.IOUtils.closeQuietly(reader);
}
}
private String findEleInMap(Map<Integer, String> map, Integer key) {
Iterator<Map.Entry<Integer, String>> iterator = map.entrySet().iterator();
while (iterator.hasNext()) {
Map.Entry<Integer, String> next = iterator.next();
if (key.equals(next.getKey())) {
return next.getValue();
}
}
return null;
}
private int findIndex(List<String> schemaFieldList, String colName) {
for (int i = 0; i < schemaFieldList.size(); i++) {
if (schemaFieldList.get(i).equals(colName)) {
return i;
}
}
return -1;
}
private List<org.apache.parquet.schema.Type> getParquetFileFields(Path filePath, org.apache.hadoop.conf.Configuration configuration) {
try (org.apache.parquet.hadoop.ParquetFileReader reader = org.apache.parquet.hadoop.ParquetFileReader.open(HadoopInputFile.fromPath(filePath, configuration))) {
org.apache.parquet.schema.MessageType schema = reader.getFooter().getFileMetaData().getSchema();
List<org.apache.parquet.schema.Type> fields = schema.getFields();
return fields;
} catch (IOException e) {
LOG.error("Fetch parquet field error", e);
throw new DataXException(String.format("Fetch parquet field error, msg is %s", e.getMessage()));
}
}
private String getParquetSchema(String sourceParquetFilePath, org.apache.hadoop.conf.Configuration hadoopConf) {
GroupReadSupport readSupport = new GroupReadSupport();
ParquetReader.Builder parquetReaderBuilder = ParquetReader.builder(readSupport, new Path(sourceParquetFilePath));
ParquetReader<Group> reader = null;
try {
parquetReaderBuilder.withConf(hadoopConf);
reader = parquetReaderBuilder.build();
Group g = null;
if ((g = reader.read()) != null) {
return g.getType().toString();
}
} catch (Throwable e) {
LOG.error("Inner error, getParquetSchema failed, message is {}", e.getMessage());
} finally {
org.apache.commons.io.IOUtils.closeQuietly(reader);
}
return null;
}
/**
* parquet 相关
*/
private static final int JULIAN_EPOCH_OFFSET_DAYS = 2440588;
private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1);
private static final long NANOS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1);
private long julianDayToMillis(int julianDay) {
return (julianDay - JULIAN_EPOCH_OFFSET_DAYS) * MILLIS_IN_DAY;
}
private org.apache.parquet.schema.OriginalType getOriginalType(org.apache.parquet.schema.Type type, Map<String, ParquetMeta> parquetMetaMap) {
ParquetMeta meta = parquetMetaMap.get(type.getName());
return meta.getOriginalType();
}
private org.apache.parquet.schema.PrimitiveType asPrimitiveType(org.apache.parquet.schema.Type type, Map<String, ParquetMeta> parquetMetaMap) {
ParquetMeta meta = parquetMetaMap.get(type.getName());
return meta.getPrimitiveType();
}
private Object readFields(Group g, org.apache.parquet.schema.Type type, int index, Map<String, ParquetMeta> parquetMetaMap, boolean isUtcTimestamp) {
if (this.getOriginalType(type, parquetMetaMap) == org.apache.parquet.schema.OriginalType.MAP) {
Group groupData = g.getGroup(index, 0);
List<org.apache.parquet.schema.Type> parquetTypes = groupData.getType().getFields();
JSONObject data = new JSONObject();
for (int i = 0; i < parquetTypes.size(); i++) {
int j = groupData.getFieldRepetitionCount(i);
// map key value 的对数
for (int k = 0; k < j; k++) {
Group groupDataK = groupData.getGroup(0, k);
List<org.apache.parquet.schema.Type> parquetTypesK = groupDataK.getType().getFields();
if (2 != parquetTypesK.size()) {
// warn: 不是key value成对出现
throw new RuntimeException(String.format("bad parquet map type: %s", groupData.getValueToString(index, 0)));
}
Object subDataKey = this.readFields(groupDataK, parquetTypesK.get(0), 0, parquetMetaMap, isUtcTimestamp);
Object subDataValue = this.readFields(groupDataK, parquetTypesK.get(1), 1, parquetMetaMap, isUtcTimestamp);
if (StringUtils.equalsIgnoreCase("key", parquetTypesK.get(0).getName())) {
((JSONObject) data).put(subDataKey.toString(), subDataValue);
} else {
((JSONObject) data).put(subDataValue.toString(), subDataKey);
}
}
}
return data;
} else if (this.getOriginalType(type, parquetMetaMap) == org.apache.parquet.schema.OriginalType.MAP_KEY_VALUE) {
Group groupData = g.getGroup(index, 0);
List<org.apache.parquet.schema.Type> parquetTypes = groupData.getType().getFields();
JSONObject data = new JSONObject();
for (int i = 0; i < parquetTypes.size(); i++) {
int j = groupData.getFieldRepetitionCount(i);
// map key value 的对数
for (int k = 0; k < j; k++) {
Group groupDataK = groupData.getGroup(0, k);
List<org.apache.parquet.schema.Type> parquetTypesK = groupDataK.getType().getFields();
if (2 != parquetTypesK.size()) {
// warn: 不是key value成对出现
throw new RuntimeException(String.format("bad parquet map type: %s", groupData.getValueToString(index, 0)));
}
Object subDataKey = this.readFields(groupDataK, parquetTypesK.get(0), 0, parquetMetaMap, isUtcTimestamp);
Object subDataValue = this.readFields(groupDataK, parquetTypesK.get(1), 1, parquetMetaMap, isUtcTimestamp);
if (StringUtils.equalsIgnoreCase("key", parquetTypesK.get(0).getName())) {
((JSONObject) data).put(subDataKey.toString(), subDataValue);
} else {
((JSONObject) data).put(subDataValue.toString(), subDataKey);
}
}
}
return data;
} else if (this.getOriginalType(type, parquetMetaMap) == org.apache.parquet.schema.OriginalType.LIST) {
Group groupData = g.getGroup(index, 0);
List<org.apache.parquet.schema.Type> parquetTypes = groupData.getType().getFields();
JSONArray data = new JSONArray();
for (int i = 0; i < parquetTypes.size(); i++) {
Object subData = this.readFields(groupData, parquetTypes.get(i), i, parquetMetaMap, isUtcTimestamp);
data.add(subData);
}
return data;
} else if (this.getOriginalType(type, parquetMetaMap) == org.apache.parquet.schema.OriginalType.DECIMAL) {
Binary binaryDate = g.getBinary(index, 0);
if (null == binaryDate) {
return null;
} else {
org.apache.hadoop.hive.serde2.io.HiveDecimalWritable decimalWritable = new org.apache.hadoop.hive.serde2.io.HiveDecimalWritable(binaryDate.getBytes(), this.asPrimitiveType(type, parquetMetaMap).getDecimalMetadata().getScale());
// g.getType().getFields().get(1).asPrimitiveType().getDecimalMetadata().getScale()
HiveDecimal hiveDecimal = decimalWritable.getHiveDecimal();
if (null == hiveDecimal) {
return null;
} else {
return hiveDecimal.bigDecimalValue();
}
// return decimalWritable.doubleValue();
}
} else if (this.getOriginalType(type, parquetMetaMap) == org.apache.parquet.schema.OriginalType.DATE) {
return java.sql.Date.valueOf(LocalDate.ofEpochDay(g.getInteger(index, 0)));
} else if (this.getOriginalType(type, parquetMetaMap) == org.apache.parquet.schema.OriginalType.UTF8) {
return g.getValueToString(index, 0);
} else {
if (type.isPrimitive()) {
PrimitiveType.PrimitiveTypeName primitiveTypeName = this.asPrimitiveType(type, parquetMetaMap).getPrimitiveTypeName();
if (PrimitiveType.PrimitiveTypeName.BINARY == primitiveTypeName) {
return g.getValueToString(index, 0);
} else if (PrimitiveType.PrimitiveTypeName.BOOLEAN == primitiveTypeName) {
return g.getValueToString(index, 0);
} else if (PrimitiveType.PrimitiveTypeName.DOUBLE == primitiveTypeName) {
return g.getValueToString(index, 0);
} else if (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY == primitiveTypeName) {
return g.getValueToString(index, 0);
} else if (PrimitiveType.PrimitiveTypeName.FLOAT == primitiveTypeName) {
return g.getValueToString(index, 0);
} else if (PrimitiveType.PrimitiveTypeName.INT32 == primitiveTypeName) {
return g.getValueToString(index, 0);
} else if (PrimitiveType.PrimitiveTypeName.INT64 == primitiveTypeName) {
return g.getValueToString(index, 0);
} else if (PrimitiveType.PrimitiveTypeName.INT96 == primitiveTypeName) {
Binary dataInt96 = g.getInt96(index, 0);
if (null == dataInt96) {
return null;
} else {
ByteBuffer buf = dataInt96.toByteBuffer();
buf.order(ByteOrder.LITTLE_ENDIAN);
long timeOfDayNanos = buf.getLong();
int julianDay = buf.getInt();
if (isUtcTimestamp) {
// UTC
LocalDate localDate = LocalDate.ofEpochDay(julianDay - JULIAN_EPOCH_OFFSET_DAYS);
LocalTime localTime = LocalTime.ofNanoOfDay(timeOfDayNanos);
return Timestamp.valueOf(LocalDateTime.of(localDate, localTime));
} else {
// local time
long mills = julianDayToMillis(julianDay) + (timeOfDayNanos / NANOS_PER_MILLISECOND);
Timestamp timestamp = new Timestamp(mills);
timestamp.setNanos((int) (timeOfDayNanos % TimeUnit.SECONDS.toNanos(1)));
return timestamp;
}
}
} else {
return g.getValueToString(index, 0);
}
} else {
return g.getValueToString(index, 0);
}
}
}
}

View File

@ -0,0 +1,21 @@
package com.alibaba.datax.plugin.reader.hdfsreader;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
/**
* Created by wmy on 16/11/29.
*/
public class HdfsPathFilter implements PathFilter {
private String regex = null;
public HdfsPathFilter(String regex) {
this.regex = regex;
}
@Override
public boolean accept(Path path) {
return regex != null ? path.getName().matches(regex) : true;
}
}

View File

@ -41,6 +41,8 @@ public class HdfsReader extends Reader {
private String specifiedFileType = null;
private DFSUtil dfsUtil = null;
private List<String> path = null;
private boolean skipEmptyOrcFile = false;
private Integer orcFileEmptySize = null;
@Override
public void init() {
@ -81,9 +83,10 @@ public class HdfsReader extends Reader {
!specifiedFileType.equalsIgnoreCase(Constant.TEXT) &&
!specifiedFileType.equalsIgnoreCase(Constant.CSV) &&
!specifiedFileType.equalsIgnoreCase(Constant.SEQ) &&
!specifiedFileType.equalsIgnoreCase(Constant.RC)){
String message = "HdfsReader插件目前支持ORC, TEXT, CSV, SEQUENCE, RC五种格式的文件," +
"请将fileType选项的值配置为ORC, TEXT, CSV, SEQUENCE 或者 RC";
!specifiedFileType.equalsIgnoreCase(Constant.RC) &&
!specifiedFileType.equalsIgnoreCase(Constant.PARQUET)){
String message = "HdfsReader插件目前支持ORC, TEXT, CSV, SEQUENCE, RC, PARQUET 六种格式的文件," +
"请将fileType选项的值配置为ORC, TEXT, CSV, SEQUENCE,RC 和 PARQUET";
throw DataXException.asDataXException(HdfsReaderErrorCode.FILE_TYPE_ERROR, message);
}
@ -115,6 +118,16 @@ public class HdfsReader extends Reader {
UnstructuredStorageReaderUtil.validateCompress(this.readerOriginConfig);
UnstructuredStorageReaderUtil.validateCsvReaderConfig(this.readerOriginConfig);
}
if (this.specifiedFileType.equalsIgnoreCase(Constant.ORC)) {
skipEmptyOrcFile = this.readerOriginConfig.getBool(Key.SKIP_EMPTY_ORCFILE, false);
orcFileEmptySize = this.readerOriginConfig.getInt(Key.ORCFILE_EMPTYSIZE);
//将orcFileEmptySize必填项检查去掉仅需要配置skipEmptyOrcFile即可考虑历史任务兼容性(For中华保险)保留orcFileEmptySize参数配置
//if (skipEmptyOrcFile && orcFileEmptySize == null) {
// throw new IllegalArgumentException("When \"skipEmptyOrcFile\" is configured, "
// + "parameter \"orcFileEmptySize\" cannot be null.");
//}
}
LOG.info("skipEmptyOrcFile: {}, orcFileEmptySize: {}", skipEmptyOrcFile, orcFileEmptySize);
}
@ -166,7 +179,7 @@ public class HdfsReader extends Reader {
@Override
public void prepare() {
LOG.info("prepare(), start to getAllFiles...");
this.sourceFiles = dfsUtil.getAllFiles(path, specifiedFileType);
this.sourceFiles = dfsUtil.getAllFiles(path, specifiedFileType,skipEmptyOrcFile, orcFileEmptySize);
LOG.info(String.format("您即将读取的文件数为: [%s], 列表为: [%s]",
this.sourceFiles.size(),
StringUtils.join(this.sourceFiles, ",")));
@ -273,7 +286,9 @@ public class HdfsReader extends Reader {
}else if(specifiedFileType.equalsIgnoreCase(Constant.RC)){
dfsUtil.rcFileStartRead(sourceFile, this.taskConfig, recordSender, this.getTaskPluginCollector());
}else {
} else if (specifiedFileType.equalsIgnoreCase(Constant.PARQUET)) {
dfsUtil.parquetFileStartRead(sourceFile, this.taskConfig, recordSender, this.getTaskPluginCollector());
} else {
String message = "HdfsReader插件目前支持ORC, TEXT, CSV, SEQUENCE, RC五种格式的文件," +
"请将fileType选项的值配置为ORC, TEXT, CSV, SEQUENCE 或者 RC";

View File

@ -19,7 +19,12 @@ public enum HdfsReaderErrorCode implements ErrorCode {
FILE_TYPE_UNSUPPORT("HdfsReader-12", "文件类型目前不支持"),
KERBEROS_LOGIN_ERROR("HdfsReader-13", "KERBEROS认证失败"),
READ_SEQUENCEFILE_ERROR("HdfsReader-14", "读取SequenceFile文件出错"),
READ_RCFILE_ERROR("HdfsReader-15", "读取RCFile文件出错"),;
READ_RCFILE_ERROR("HdfsReader-15", "读取RCFile文件出错"),
INIT_RCFILE_SERDE_ERROR("HdfsReader-16", "Deserialize RCFile, initialization failed!"),
PARSE_MESSAGE_TYPE_FROM_SCHEMA_ERROR("HdfsReader-17", "Error parsing ParquetSchema"),
INVALID_PARQUET_SCHEMA("HdfsReader-18", "ParquetSchema is invalid"),
READ_PARQUET_ERROR("HdfsReader-19", "Error reading Parquet file"),
CONNECT_HDFS_IO_ERROR("HdfsReader-20", "I/O exception in establishing connection with HDFS");
private final String code;
private final String description;

View File

@ -7,9 +7,60 @@ public final class Key {
*/
public final static String PATH = "path";
public final static String DEFAULT_FS = "defaultFS";
public final static String HIVE_VERSION = "hiveVersion";
public static final String FILETYPE = "fileType";
public static final String HADOOP_CONFIG = "hadoopConfig";
public static final String HAVE_KERBEROS = "haveKerberos";
public static final String KERBEROS_KEYTAB_FILE_PATH = "kerberosKeytabFilePath";
public static final String KERBEROS_CONF_FILE_PATH = "kerberosConfFilePath";
public static final String KERBEROS_PRINCIPAL = "kerberosPrincipal";
public static final String PATH_FILTER = "pathFilter";
public static final String PARQUET_SCHEMA = "parquetSchema";
/**
* hive 3.x cdh高版本使用UTC时区存储时间戳如果发现时区偏移该配置项要配置成 true
*/
public static final String PARQUET_UTC_TIMESTAMP = "parquetUtcTimestamp";
public static final String SUCCESS_ON_NO_FILE = "successOnNoFile";
public static final String PROTECTION = "protection";
/**
* 用于显示地指定hdfs客户端的用户名
*/
public static final String HDFS_USERNAME = "hdfsUsername";
/**
* ORC FILE空文件大小
*/
public static final String ORCFILE_EMPTYSIZE = "orcFileEmptySize";
/**
* 是否跳过空的OrcFile
*/
public static final String SKIP_EMPTY_ORCFILE = "skipEmptyOrcFile";
/**
* 是否跳过 orc meta 信息
*/
public static final String SKIP_ORC_META = "skipOrcMetaInfo";
/**
* 过滤_或者.开头的文件
*/
public static final String REGEX_PATTERN = "^.*[/][^._].*";
public static final String FILTER_TAG_FILE = "filterTagFile";
// high level params refs https://github.com/aliyun/alibabacloud-jindodata/blob/master/docs/user/4.x/4.4.0/oss/configuration/jindosdk_configuration_list.md
// <!-- oss 并发下载任务队列大小 -->
public static final String FS_OSS_DOWNLOAD_QUEUE_SIZE = "ossDownloadQueueSize";
// <!-- 进程内 oss 最大并发下载任务数 -->
public static final String FS_OSS_DOWNLOAD_THREAD_CONCURRENCY = "ossDownloadThreadConcurrency";
public static final String FS_OSS_READ_READAHEAD_BUFFER_COUNT = "ossDownloadBufferCount";
public static final String FILE_SYSTEM_TYPE = "fileSystemType";
public static final String CDH_3_X_HIVE_VERSION = "3.1.3-cdh";
public static final String SUPPORT_ADD_MIDDLE_COLUMN = "supportAddMiddleColumn";
}

View File

@ -0,0 +1,33 @@
package com.alibaba.datax.plugin.reader.hdfsreader;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* @author jitongchen
* @date 2023/9/7 10:20 AM
*/
public class ParquetMessageHelper {
public static Map<String, ParquetMeta> parseParquetTypes(List<org.apache.parquet.schema.Type> parquetTypes) {
int fieldCount = parquetTypes.size();
Map<String, ParquetMeta> parquetMetaMap = new HashMap<String, ParquetMeta>();
for (int i = 0; i < fieldCount; i++) {
org.apache.parquet.schema.Type type = parquetTypes.get(i);
String name = type.getName();
ParquetMeta parquetMeta = new ParquetMeta();
parquetMeta.setName(name);
OriginalType originalType = type.getOriginalType();
parquetMeta.setOriginalType(originalType);
if (type.isPrimitive()) {
PrimitiveType primitiveType = type.asPrimitiveType();
parquetMeta.setPrimitiveType(primitiveType);
}
parquetMetaMap.put(name, parquetMeta);
}
return parquetMetaMap;
}
}

View File

@ -0,0 +1,38 @@
package com.alibaba.datax.plugin.reader.hdfsreader;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
/**
* @author jitongchen
* @date 2023/9/7 10:20 AM
*/
public class ParquetMeta {
private String name;
private OriginalType originalType;
private PrimitiveType primitiveType;
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public OriginalType getOriginalType() {
return originalType;
}
public void setOriginalType(OriginalType originalType) {
this.originalType = originalType;
}
public PrimitiveType getPrimitiveType() {
return primitiveType;
}
public void setPrimitiveType(PrimitiveType primitiveType) {
this.primitiveType = primitiveType;
}
}

View File

@ -27,9 +27,8 @@ import org.apache.hadoop.mapred.*;
import org.apache.hadoop.security.UserGroupInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import parquet.schema.OriginalType;
import parquet.schema.PrimitiveType;
import parquet.schema.Types;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.schema.*;
import java.io.IOException;
import java.text.SimpleDateFormat;
@ -626,4 +625,61 @@ public class HdfsHelper {
}
return typeBuilder.named("m").toString();
}
public void parquetFileStartWrite(RecordReceiver lineReceiver, Configuration config, String fileName, TaskPluginCollector taskPluginCollector, Configuration taskConfig) {
MessageType messageType = null;
ParquetFileProccessor proccessor = null;
Path outputPath = new Path(fileName);
String schema = config.getString(Key.PARQUET_SCHEMA);
try {
messageType = MessageTypeParser.parseMessageType(schema);
} catch (Exception e) {
String message = String.format("Error parsing the Schema string [%s] into MessageType", schema);
LOG.error(message);
throw DataXException.asDataXException(HdfsWriterErrorCode.PARSE_MESSAGE_TYPE_FROM_SCHEMA_ERROR, e);
}
// determine the compression codec
String compress = config.getString(Key.COMPRESS, null);
// be compatible with the old NONE
if ("NONE".equalsIgnoreCase(compress)) {
compress = "UNCOMPRESSED";
}
CompressionCodecName compressionCodecName = CompressionCodecName.fromConf(compress);
LOG.info("The compression codec used for parquet writing is: {}", compressionCodecName, compress);
try {
proccessor = new ParquetFileProccessor(outputPath, messageType, compressionCodecName, false, taskConfig, taskPluginCollector, hadoopConf);
} catch (Exception e) {
String message = String.format("Initializing ParquetFileProccessor based on Schema[%s] failed.", schema);
LOG.error(message);
throw DataXException.asDataXException(HdfsWriterErrorCode.INIT_PROCCESSOR_FAILURE, e);
}
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmm");
String attempt = "attempt_" + dateFormat.format(new Date()) + "_0001_m_000000_0";
conf.set(JobContext.TASK_ATTEMPT_ID, attempt);
FileOutputFormat outFormat = new TextOutputFormat();
outFormat.setOutputPath(conf, outputPath);
outFormat.setWorkOutputPath(conf, outputPath);
try {
Record record = null;
while ((record = lineReceiver.getFromReader()) != null) {
proccessor.write(record);
}
} catch (Exception e) {
String message = String.format("An exception occurred while writing the file file [%s]", fileName);
LOG.error(message);
Path path = new Path(fileName);
deleteDir(path.getParent());
throw DataXException.asDataXException(HdfsWriterErrorCode.Write_FILE_IO_ERROR, e);
} finally {
if (proccessor != null) {
try {
proccessor.close();
} catch (IOException e) {
LOG.error(e.getMessage(), e);
}
}
}
}
}

View File

@ -53,8 +53,8 @@ public class HdfsWriter extends Writer {
this.defaultFS = this.writerSliceConfig.getNecessaryValue(Key.DEFAULT_FS, HdfsWriterErrorCode.REQUIRED_VALUE);
//fileType check
this.fileType = this.writerSliceConfig.getNecessaryValue(Key.FILE_TYPE, HdfsWriterErrorCode.REQUIRED_VALUE);
if( !fileType.equalsIgnoreCase("ORC") && !fileType.equalsIgnoreCase("TEXT")){
String message = "HdfsWriter插件目前只支持ORC和TEXT两种格式的文件,请将filetype选项的值配置为ORC或者TEXT";
if (!fileType.equalsIgnoreCase("ORC") && !fileType.equalsIgnoreCase("TEXT") && !fileType.equalsIgnoreCase("PARQUET")) {
String message = "HdfsWriter插件目前只支持ORC、TEXT、PARQUET三种格式的文件,请将filetype选项的值配置为ORC、TEXT或PARQUET";
throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE, message);
}
//path
@ -415,6 +415,9 @@ public class HdfsWriter extends Writer {
//写ORC FILE
hdfsHelper.orcFileStartWrite(lineReceiver,this.writerSliceConfig, this.fileName,
this.getTaskPluginCollector());
} else if (fileType.equalsIgnoreCase("PARQUET")) {
//写PARQUET FILE
hdfsHelper.parquetFileStartWrite(lineReceiver, this.writerSliceConfig, this.fileName, this.getTaskPluginCollector(), this.writerSliceConfig);
}
LOG.info("end do write");

View File

@ -16,7 +16,11 @@ public enum HdfsWriterErrorCode implements ErrorCode {
CONNECT_HDFS_IO_ERROR("HdfsWriter-06", "与HDFS建立连接时出现IO异常."),
COLUMN_REQUIRED_VALUE("HdfsWriter-07", "您column配置中缺失了必须填写的参数值."),
HDFS_RENAME_FILE_ERROR("HdfsWriter-08", "将文件移动到配置路径失败."),
KERBEROS_LOGIN_ERROR("HdfsWriter-09", "KERBEROS认证失败");
KERBEROS_LOGIN_ERROR("HdfsWriter-09", "KERBEROS认证失败"),
PARSE_MESSAGE_TYPE_FROM_SCHEMA_ERROR("HdfsWriter-10", "Parse parquet schema error"),
INIT_PROCCESSOR_FAILURE("HdfsWriter-11", "Init processor failed");
private final String code;
private final String description;

View File

@ -46,4 +46,32 @@ public class Key {
public static final String PARQUET_SCHEMA = "parquetSchema";
public static final String PARQUET_MERGE_RESULT = "parquetMergeResult";
/**
* hive 3.x cdh高版本使用UTC时区存储时间戳如果发现时区偏移该配置项要配置成 true
*/
public static final String PARQUET_UTC_TIMESTAMP = "parquetUtcTimestamp";
// Kerberos
public static final String KERBEROS_CONF_FILE_PATH = "kerberosConfFilePath";
// PanguFS
public final static String PANGU_FS_CONFIG = "panguFSConfig";
public final static String PANGU_FS_CONFIG_NUWA_CLUSTER = "nuwaCluster";
public final static String PANGU_FS_CONFIG_NUWA_SERVERS = "nuwaServers";
public final static String PANGU_FS_CONFIG_NUWA_PROXIES = "nuwaProxies";
public final static String PANGU_FS_CONFIG_CAPABILITY = "capability";
public static final String FS_OSS_UPLOAD_THREAD_CONCURRENCY = "ossUploadConcurrency";
// <!-- oss 并发上传任务队列大小 -->
public static final String FS_OSS_UPLOAD_QUEUE_SIZE = "ossUploadQueueSize";
// <!-- 进程内 oss 最大并发上传任务数 -->
public static final String FS_OSS_UPLOAD_MAX_PENDING_TASKS_PER_STREAM = "ossUploadMaxPendingTasksPerStream";
public static final String FS_OSS_BLOCKLET_SIZE_MB = "ossBlockSize";
public static final String FILE_SYSTEM_TYPE = "fileSystemType";
public static final String ENABLE_COLUMN_EXCHANGE = "enableColumnExchange";
public static final String SUPPORT_HIVE_DATETIME = "supportHiveDateTime";
}

View File

@ -0,0 +1,30 @@
package com.alibaba.datax.plugin.writer.hdfswriter;
import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.plugin.TaskPluginCollector;
import com.alibaba.datax.common.util.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.hadoop.ParquetWriter;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.schema.MessageType;
import java.io.IOException;
/**
* @author jitongchen
* @date 2023/9/7 9:41 AM
*/
public class ParquetFileProccessor extends ParquetWriter<Record> {
public ParquetFileProccessor(Path file, MessageType schema, boolean enableDictionary, Configuration taskConfig, TaskPluginCollector taskPluginCollector, org.apache.hadoop.conf.Configuration configuration) throws IOException {
this(file, schema, CompressionCodecName.UNCOMPRESSED, enableDictionary, taskConfig, taskPluginCollector, configuration);
}
public ParquetFileProccessor(Path file, MessageType schema, CompressionCodecName codecName, boolean enableDictionary, Configuration taskConfig, TaskPluginCollector taskPluginCollector) throws IOException {
super(file, new ParquetFileSupport(schema, taskConfig, taskPluginCollector), codecName, DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE, DEFAULT_PAGE_SIZE, enableDictionary, false, DEFAULT_WRITER_VERSION);
}
public ParquetFileProccessor(Path file, MessageType schema, CompressionCodecName codecName, boolean enableDictionary, Configuration taskConfig, TaskPluginCollector taskPluginCollector, org.apache.hadoop.conf.Configuration configuration) throws IOException {
super(file, new ParquetFileSupport(schema, taskConfig, taskPluginCollector), codecName, DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE, DEFAULT_PAGE_SIZE, enableDictionary, false, DEFAULT_WRITER_VERSION, configuration);
}
}

View File

@ -0,0 +1,642 @@
package com.alibaba.datax.plugin.writer.hdfswriter;
import com.alibaba.datax.common.element.*;
import com.alibaba.datax.common.plugin.TaskPluginCollector;
import com.alibaba.datax.common.util.LimitLogger;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import parquet.column.ColumnDescriptor;
import parquet.hadoop.api.WriteSupport;
import parquet.io.api.Binary;
import parquet.io.api.RecordConsumer;
import parquet.schema.*;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.temporal.ChronoField;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.TimeUnit;
/**
* @author jitongchen
* @date 2023/9/7 9:41 AM
*/
public class ParquetFileSupport extends WriteSupport<Record> {
public static final Logger LOGGER = LoggerFactory.getLogger(ParquetFileSupport.class);
private MessageType schema;
private List<ColumnDescriptor> columns;
private RecordConsumer recordConsumer;
private boolean useRawDataTransf = true;
private boolean printStackTrace = true;
// 不通类型的nullFormat
private String nullFormat;
private String dateFormat;
private boolean isUtcTimestamp;
private SimpleDateFormat dateParse;
private Binary binaryForNull;
private TaskPluginCollector taskPluginCollector;
private String dataxParquetMode;
public ParquetFileSupport(MessageType schema, com.alibaba.datax.common.util.Configuration taskConfig, TaskPluginCollector taskPluginCollector) {
this.schema = schema;
this.columns = schema.getColumns();
this.useRawDataTransf = taskConfig.getBool(Key.PARQUET_FILE_USE_RAW_DATA_TRANSF, true);
// 不通类型的nullFormat
this.nullFormat = taskConfig.getString(Key.NULL_FORMAT, Constant.DEFAULT_NULL_FORMAT);
this.binaryForNull = Binary.fromString(this.nullFormat);
this.dateFormat = taskConfig.getString(Key.DATE_FORMAT, null);
if (StringUtils.isNotBlank(this.dateFormat)) {
this.dateParse = new SimpleDateFormat(dateFormat);
}
this.isUtcTimestamp = taskConfig.getBool(Key.PARQUET_UTC_TIMESTAMP, false);
this.taskPluginCollector = taskPluginCollector;
if (taskConfig.getKeys().contains("dataxParquetMode")) {
this.dataxParquetMode = taskConfig.getString("dataxParquetMode");
} else {
// 默认值是columns
this.dataxParquetMode = "columns";
}
}
@Override
public WriteContext init(Configuration configuration) {
return new WriteContext(schema, new HashMap<String, String>());
}
@Override
public void prepareForWrite(RecordConsumer recordConsumer) {
this.recordConsumer = recordConsumer;
}
@Override
public void write(Record values) {
if (dataxParquetMode.equalsIgnoreCase("fields")) {
writeBaseOnFields(values);
return;
}
// NOTE: 下面的实现其实是不对的只是看代码注释貌似有用户已经在用
// 所以暂时不动下面的逻辑
// 默认走的就是下面的这条代码路径
if (values != null && columns != null && values.getColumnNumber() == columns.size()) {
recordConsumer.startMessage();
for (int i = 0; i < columns.size(); i++) {
Column value = values.getColumn(i);
ColumnDescriptor columnDescriptor = columns.get(i);
Type type = this.schema.getFields().get(i);
if (value != null) {
try {
if (this.useRawDataTransf) {
if (value.getRawData() == null) {
continue;
}
recordConsumer.startField(columnDescriptor.getPath()[0], i);
// 原来使用Column->RawData的方法其实是错误的类型转换策略会将DataX的数据内部表示形象序列化出去
// 但是 Parquet 已经有用户使用了故暂时只是配置项切换
String rawData = value.getRawData().toString();
switch (columnDescriptor.getType()) {
case BOOLEAN:
recordConsumer.addBoolean(Boolean.parseBoolean(rawData));
break;
case FLOAT:
recordConsumer.addFloat(Float.parseFloat(rawData));
break;
case DOUBLE:
recordConsumer.addDouble(Double.parseDouble(rawData));
break;
case INT32:
OriginalType originalType = type.getOriginalType();
if (originalType != null && StringUtils.equalsIgnoreCase("DATE", originalType.name())) {
int realVal = (int) (new java.sql.Date(Long.parseLong(rawData)).toLocalDate().toEpochDay());
recordConsumer.addInteger(realVal);
} else {
recordConsumer.addInteger(Integer.parseInt(rawData));
}
break;
case INT64:
recordConsumer.addLong(Long.valueOf(rawData));
break;
case INT96:
recordConsumer.addBinary(timestampColToBinary(value));
break;
case BINARY:
recordConsumer.addBinary(Binary.fromString(rawData));
break;
case FIXED_LEN_BYTE_ARRAY:
PrimitiveType primitiveType = type.asPrimitiveType();
if (primitiveType.getDecimalMetadata() != null) {
// decimal
recordConsumer.addBinary(decimalToBinary(value, primitiveType.getDecimalMetadata().getPrecision(), primitiveType.getDecimalMetadata().getScale()));
break;
}
/* fall through */
default:
recordConsumer.addBinary(Binary.fromString(rawData));
break;
}
recordConsumer.endField(columnDescriptor.getPath()[0], i);
} else {
boolean isNull = null == value.getRawData();
if (!isNull) {
recordConsumer.startField(columnDescriptor.getPath()[0], i);
// no skip: empty fields are illegal, the field should be ommited completely instead
switch (columnDescriptor.getType()) {
case BOOLEAN:
recordConsumer.addBoolean(value.asBoolean());
break;
case FLOAT:
recordConsumer.addFloat(value.asDouble().floatValue());
break;
case DOUBLE:
recordConsumer.addDouble(value.asDouble());
break;
case INT32:
OriginalType originalType = type.getOriginalType();
if (originalType != null && StringUtils.equalsIgnoreCase("DATE", originalType.name())) {
int realVal = (int) (new java.sql.Date(value.asLong()).toLocalDate().toEpochDay());
recordConsumer.addInteger(realVal);
} else {
recordConsumer.addInteger(value.asLong().intValue());
}
break;
case INT64:
recordConsumer.addLong(value.asLong());
break;
case INT96:
recordConsumer.addBinary(timestampColToBinary(value));
break;
case BINARY:
String valueAsString2Write = null;
if (Column.Type.DATE == value.getType() && null != this.dateParse) {
valueAsString2Write = dateParse.format(value.asDate());
} else {
valueAsString2Write = value.asString();
}
recordConsumer.addBinary(Binary.fromString(valueAsString2Write));
break;
case FIXED_LEN_BYTE_ARRAY:
PrimitiveType primitiveType = type.asPrimitiveType();
if (primitiveType.getDecimalMetadata() != null) {
// decimal
recordConsumer.addBinary(decimalToBinary(value, primitiveType.getDecimalMetadata().getPrecision(), primitiveType.getDecimalMetadata().getScale()));
break;
}
/* fall through */
default:
recordConsumer.addBinary(Binary.fromString(value.asString()));
break;
}
recordConsumer.endField(columnDescriptor.getPath()[0], i);
}
}
} catch (Exception e) {
if (printStackTrace) {
printStackTrace = false;
LOGGER.warn("write to parquet error: {}", e.getMessage(), e);
}
// dirty data
if (null != this.taskPluginCollector) {
// job post 里面的merge taskPluginCollector 为null
this.taskPluginCollector.collectDirtyRecord(values, e, e.getMessage());
}
}
} else {
recordConsumer.addBinary(this.binaryForNull);
}
}
recordConsumer.endMessage();
}
}
private Binary decimalToBinary(Column value, int precision, int scale) {
BigDecimal bigDecimal = value.asBigDecimal();
bigDecimal = bigDecimal.setScale(scale, RoundingMode.HALF_UP);
byte[] decimalBytes = bigDecimal.unscaledValue().toByteArray();
int precToBytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[precision - 1];
if (precToBytes == decimalBytes.length) {
// No padding needed.
return Binary.fromByteArray(decimalBytes);
}
byte[] tgt = new byte[precToBytes];
// padding -1 for negative number
if (bigDecimal.compareTo(new BigDecimal("0")) < 0) {
Arrays.fill(tgt, 0, precToBytes - decimalBytes.length, (byte) -1);
}
System.arraycopy(decimalBytes, 0, tgt, precToBytes - decimalBytes.length, decimalBytes.length);
return Binary.fromByteArray(tgt);
}
private static final int JULIAN_EPOCH_OFFSET_DAYS = 2_440_588;
private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1);
private static final long MILLS_PER_SECOND = TimeUnit.SECONDS.toMillis(1);
private static final long NANOS_PER_DAY = TimeUnit.DAYS.toNanos(1);
private static final long NANOS_PER_SECOND = TimeUnit.SECONDS.toNanos(1);
private static final ZoneOffset defaultOffset = OffsetDateTime.now().getOffset();
/**
* int 96 is timestamp in parquet
*
* @param valueColumn
* @return
*/
private Binary timestampColToBinary(Column valueColumn) {
if (valueColumn.getRawData() == null) {
return Binary.EMPTY;
}
long mills;
long nanos = 0;
if (valueColumn instanceof DateColumn) {
DateColumn dateColumn = (DateColumn) valueColumn;
mills = dateColumn.asLong();
nanos = dateColumn.getNanos();
} else {
mills = valueColumn.asLong();
}
int julianDay;
long nanosOfDay;
if (isUtcTimestamp) {
// utc ignore current timezone (task should set timezone same as hive/hdfs)
long seconds = mills >= 0 ? mills / MILLS_PER_SECOND : (mills / MILLS_PER_SECOND - 1);
LocalDateTime localDateTime = LocalDateTime.ofEpochSecond(seconds, (int) nanos, defaultOffset);
julianDay = (int) (localDateTime.getLong(ChronoField.EPOCH_DAY) + JULIAN_EPOCH_OFFSET_DAYS);
nanosOfDay = localDateTime.getLong(ChronoField.NANO_OF_DAY);
} else {
// local date
julianDay = (int) ((mills / MILLIS_IN_DAY) + JULIAN_EPOCH_OFFSET_DAYS);
if (mills >= 0) {
nanosOfDay = ((mills % MILLIS_IN_DAY) / MILLS_PER_SECOND) * NANOS_PER_SECOND + nanos;
} else {
julianDay--;
nanosOfDay = (((mills % MILLIS_IN_DAY) / MILLS_PER_SECOND) - 1) * NANOS_PER_SECOND + nanos;
nanosOfDay += NANOS_PER_DAY;
}
}
ByteBuffer buf = ByteBuffer.allocate(12);
buf.order(ByteOrder.LITTLE_ENDIAN);
buf.putLong(nanosOfDay);
buf.putInt(julianDay);
buf.flip();
return Binary.fromByteBuffer(buf);
}
private void writeBaseOnFields(Record values) {
//LOGGER.info("Writing parquet data using fields mode(The correct mode.)");
List<Type> types = this.schema.getFields();
if (values != null && types != null && values.getColumnNumber() == types.size()) {
recordConsumer.startMessage();
writeFields(types, values);
recordConsumer.endMessage();
}
}
private void writeFields(List<Type> types, Record values) {
for (int i = 0; i < types.size(); i++) {
Type type = types.get(i);
Column value = values.getColumn(i);
if (value != null) {
try {
if (type.isPrimitive()) {
writePrimitiveType(type, value, i);
} else {
writeGroupType(type, (JSON) JSON.parse(value.asString()), i);
}
} catch (Exception e) {
if (printStackTrace) {
printStackTrace = false;
LOGGER.warn("write to parquet error: {}", e.getMessage(), e);
}
// dirty data
if (null != this.taskPluginCollector) {
// job post 里面的merge taskPluginCollector 为null
this.taskPluginCollector.collectDirtyRecord(values, e, e.getMessage());
}
}
}
}
}
private void writeFields(List<Type> types, JSONObject values) {
for (int i = 0; i < types.size(); i++) {
Type type = types.get(i);
Object value = values.get(type.getName());
if (value != null) {
try {
if (type.isPrimitive()) {
writePrimitiveType(type, value, i);
} else {
writeGroupType(type, (JSON) value, i);
}
} catch (Exception e) {
if (printStackTrace) {
printStackTrace = false;
LOGGER.warn("write to parquet error: {}", e.getMessage(), e);
}
}
} else {
recordConsumer.addBinary(this.binaryForNull);
}
}
}
private void writeGroupType(Type type, JSON value, int index) {
GroupType groupType = type.asGroupType();
OriginalType originalType = groupType.getOriginalType();
if (originalType != null) {
switch (originalType) {
case MAP:
writeMap(groupType, value, index);
break;
case LIST:
writeList(groupType, value, index);
break;
default:
break;
}
} else {
// struct
writeStruct(groupType, value, index);
}
}
private void writeMap(GroupType groupType, JSON value, int index) {
if (value == null) {
return;
}
JSONObject json = (JSONObject) value;
if (json.isEmpty()) {
return;
}
recordConsumer.startField(groupType.getName(), index);
recordConsumer.startGroup();
// map
// key_value start
recordConsumer.startField("key_value", 0);
recordConsumer.startGroup();
List<Type> keyValueFields = groupType.getFields().get(0).asGroupType().getFields();
Type keyType = keyValueFields.get(0);
Type valueType = keyValueFields.get(1);
for (String key : json.keySet()) {
// key
writePrimitiveType(keyType, key, 0);
// value
if (valueType.isPrimitive()) {
writePrimitiveType(valueType, json.get(key), 1);
} else {
writeGroupType(valueType, (JSON) json.get(key), 1);
}
}
recordConsumer.endGroup();
recordConsumer.endField("key_value", 0);
// key_value end
recordConsumer.endGroup();
recordConsumer.endField(groupType.getName(), index);
}
private void writeList(GroupType groupType, JSON value, int index) {
if (value == null) {
return;
}
JSONArray json = (JSONArray) value;
if (json.isEmpty()) {
return;
}
recordConsumer.startField(groupType.getName(), index);
// list
recordConsumer.startGroup();
// list start
recordConsumer.startField("list", 0);
recordConsumer.startGroup();
Type elementType = groupType.getFields().get(0).asGroupType().getFields().get(0);
if (elementType.isPrimitive()) {
for (Object elementValue : json) {
writePrimitiveType(elementType, elementValue, 0);
}
} else {
for (Object elementValue : json) {
writeGroupType(elementType, (JSON) elementValue, 0);
}
}
recordConsumer.endGroup();
recordConsumer.endField("list", 0);
// list end
recordConsumer.endGroup();
recordConsumer.endField(groupType.getName(), index);
}
private void writeStruct(GroupType groupType, JSON value, int index) {
if (value == null) {
return;
}
JSONObject json = (JSONObject) value;
if (json.isEmpty()) {
return;
}
recordConsumer.startField(groupType.getName(), index);
// struct start
recordConsumer.startGroup();
writeFields(groupType.getFields(), json);
recordConsumer.endGroup();
// struct end
recordConsumer.endField(groupType.getName(), index);
}
private void writePrimitiveType(Type type, Object value, int index) {
if (value == null) {
return;
}
recordConsumer.startField(type.getName(), index);
PrimitiveType primitiveType = type.asPrimitiveType();
switch (primitiveType.getPrimitiveTypeName()) {
case BOOLEAN:
recordConsumer.addBoolean((Boolean) value);
break;
case FLOAT:
if (value instanceof Float) {
recordConsumer.addFloat(((Float) value).floatValue());
} else if (value instanceof Double) {
recordConsumer.addFloat(((Double) value).floatValue());
} else if (value instanceof Long) {
recordConsumer.addFloat(((Long) value).floatValue());
} else if (value instanceof Integer) {
recordConsumer.addFloat(((Integer) value).floatValue());
}
break;
case DOUBLE:
if (value instanceof Float) {
recordConsumer.addDouble(((Float) value).doubleValue());
} else if (value instanceof Double) {
recordConsumer.addDouble(((Double) value).doubleValue());
} else if (value instanceof Long) {
recordConsumer.addDouble(((Long) value).doubleValue());
} else if (value instanceof Integer) {
recordConsumer.addDouble(((Integer) value).doubleValue());
}
break;
case INT32:
if (value instanceof Integer) {
recordConsumer.addInteger((Integer) value);
} else if (value instanceof Long) {
recordConsumer.addInteger(((Long) value).intValue());
} else {
// 之前代码写的有问题导致这里丢列了没抛异常先收集后续看看有没有任务命中在决定怎么改
LimitLogger.limit("dirtyDataHiveWriterParquet", TimeUnit.MINUTES.toMillis(1), () -> LOGGER.warn("dirtyDataHiveWriterParquet {}", String.format("Invalid value: %s(clazz: %s) for field: %s", value, value.getClass(), type.getName())));
}
break;
case INT64:
if (value instanceof Integer) {
recordConsumer.addLong(((Integer) value).longValue());
} else if (value instanceof Long) {
recordConsumer.addInteger(((Long) value).intValue());
} else {
// 之前代码写的有问题导致这里丢列了没抛异常先收集后续看看有没有任务命中在决定怎么改
LimitLogger.limit("dirtyDataHiveWriterParquet", TimeUnit.MINUTES.toMillis(1), () -> LOGGER.warn("dirtyDataHiveWriterParquet {}", String.format("Invalid value: %s(clazz: %s) for field: %s", value, value.getClass(), type.getName())));
}
break;
case INT96:
if (value instanceof Integer) {
recordConsumer.addBinary(timestampColToBinary(new LongColumn((Integer) value)));
} else if (value instanceof Long) {
recordConsumer.addBinary(timestampColToBinary(new LongColumn((Long) value)));
} else if (value instanceof Timestamp) {
recordConsumer.addBinary(timestampColToBinary(new DateColumn((Timestamp) value)));
} else if (value instanceof Date) {
recordConsumer.addBinary(timestampColToBinary(new DateColumn((Date) value)));
} else {
recordConsumer.addBinary(timestampColToBinary(new StringColumn(value.toString())));
}
break;
case FIXED_LEN_BYTE_ARRAY:
if (primitiveType.getDecimalMetadata() != null) {
// decimal
Column column;
if (value instanceof Integer) {
column = new LongColumn((Integer) value);
} else if (value instanceof Long) {
column = new LongColumn((Long) value);
} else if (value instanceof Double) {
column = new DoubleColumn((Double) value);
} else if (value instanceof BigDecimal) {
column = new DoubleColumn((BigDecimal) value);
} else {
column = new StringColumn(value.toString());
}
recordConsumer.addBinary(decimalToBinary(column, primitiveType.getDecimalMetadata().getPrecision(), primitiveType.getDecimalMetadata().getScale()));
break;
}
/* fall through */
case BINARY:
default:
recordConsumer.addBinary(Binary.fromString((String) value));
break;
}
recordConsumer.endField(type.getName(), index);
}
private void writePrimitiveType(Type type, Column value, int index) {
if (value == null || value.getRawData() == null) {
return;
}
recordConsumer.startField(type.getName(), index);
PrimitiveType primitiveType = type.asPrimitiveType();
switch (primitiveType.getPrimitiveTypeName()) {
case BOOLEAN:
recordConsumer.addBoolean(value.asBoolean());
break;
case FLOAT:
recordConsumer.addFloat(value.asDouble().floatValue());
break;
case DOUBLE:
recordConsumer.addDouble(value.asDouble());
break;
case INT32:
OriginalType originalType = type.getOriginalType();
if (OriginalType.DATE.equals(originalType)) {
int realVal = (int) (new java.sql.Date(value.asLong()).toLocalDate().toEpochDay());
recordConsumer.addInteger(realVal);
} else {
recordConsumer.addInteger(value.asLong().intValue());
}
break;
case INT64:
recordConsumer.addLong(value.asLong());
break;
case INT96:
recordConsumer.addBinary(timestampColToBinary(value));
break;
case BINARY:
String valueAsString2Write = null;
if (Column.Type.DATE == value.getType() && null != this.dateParse) {
valueAsString2Write = dateParse.format(value.asDate());
} else {
valueAsString2Write = value.asString();
}
recordConsumer.addBinary(Binary.fromString(valueAsString2Write));
break;
case FIXED_LEN_BYTE_ARRAY:
if (primitiveType.getDecimalMetadata() != null) {
// decimal
recordConsumer.addBinary(decimalToBinary(value, primitiveType.getDecimalMetadata().getPrecision(), primitiveType.getDecimalMetadata().getScale()));
break;
}
/* fall through */
default:
recordConsumer.addBinary(Binary.fromString(value.asString()));
break;
}
recordConsumer.endField(type.getName(), index);
}
}

View File

@ -0,0 +1,193 @@
# DataX neo4jWriter 插件文档
## 功能简介
本目前市面上的neo4j 批量导入主要有Cypher Create,Load CSV,第三方或者官方提供的Batch Import。Load CSV支持节点10W级别一下Batch Import 需要对数据库进行停机。要想实现不停机的数据写入Cypher是最好的方式。
## 支持版本
支持Neo4j 4 和Neo4j 5,如果是Neo4j 3,需要自行将驱动降低至相对应的版本进行编译。
## 实现原理
将datax的数据转换成了neo4j驱动能识别的对象利用 unwind 语法进行批量插入。
## 如何配置
### 配置项介绍
| 配置 | 说明 | 是否必须 | 默认值 | 示例 |
|:-------------------------------|--------------------| -------- | ------ | ---------------------------------------------------- |
| database | 数据库名字 | 是 | - | neo4j |
| uri | 数据库访问链接 | 是 | - | bolt://localhost:7687 |
| username | 访问用户名 | 是 | - | neo4j |
| password | 访问密码 | 是 | - | neo4j |
| bearerToken | 权限相关 | 否 | - | - |
| kerberosTicket | 权限相关 | 否 | - | - |
| cypher | 同步语句 | 是 | - | unwind $batch as row create(p) set p.name = row.name |
| batchDataVariableName | unwind 携带的数据变量名 | | | batch |
| properties | 定义neo4j中数据的属性名字和类型 | 是 | - | 见后续案例 |
| batchSize | 一批写入数据量 | 否 | 1000 | |
| maxTransactionRetryTimeSeconds | 事务运行最长时间 | 否 | 30秒 | 30 |
| maxConnectionTimeoutSeconds | 驱动最长链接时间 | 否 | 30秒 | 30 |
| retryTimes | 发生错误的重试次数 | 否 | 3次 | 3 |
| retrySleepMills | 重试失败后的等待时间 | 否 | 3秒 | 3 |
### 支持的数据类型
> 配置时均忽略大小写
```
BOOLEAN,
STRING,
LONG,
SHORT,
INTEGER,
DOUBLE,
FLOAT,
LOCAL_DATE,
LOCAL_TIME,
LOCAL_DATE_TIME,
LIST,
//map类型支持 . 属性表达式取值
MAP,
CHAR_ARRAY,
BYTE_ARRAY,
BOOLEAN_ARRAY,
STRING_ARRAY,
LONG_ARRAY,
INT_ARRAY,
SHORT_ARRAY,
DOUBLE_ARRAY,
FLOAT_ARRAY,
Object_ARRAY
```
### 写节点
这里提供了一个写节点包含很多类型属性的例子。你可以在我的测试方法中运行。
```json
"writer": {
"name": "neo4jWriter",
"parameter": {
"uri": "neo4j://localhost:7687",
"username": "neo4j",
"password": "Test@12343",
"database": "neo4j",
"cypher": "unwind $batch as row create(p:Person) set p.pbool = row.pbool,p.pstring = row.pstring,p.plong = row.plong,p.pshort = row.pshort,p.pdouble=row.pdouble,p.pstringarr=row.pstringarr,p.plocaldate=row.plocaldate",
"batchDataVariableName": "batch",
"batchSize": "33",
"properties": [
{
"name": "pbool",
"type": "BOOLEAN"
},
{
"name": "pstring",
"type": "STRING"
},
{
"name": "plong",
"type": "LONG"
},
{
"name": "pshort",
"type": "SHORT"
},
{
"name": "pdouble",
"type": "DOUBLE"
},
{
"name": "pstringarr",
"type": "STRING_ARRAY",
"split": ","
},
{
"name": "plocaldate",
"type": "LOCAL_DATE",
"dateFormat": "yyyy-MM-dd"
}
]
}
}
```
### 写关系
```json
"writer": {
"name": "neo4jWriter",
"parameter": {
"uri": "neo4j://localhost:7687",
"username": "neo4j",
"password": "Test@12343",
"database": "neo4j",
"cypher": "unwind $batch as row match(p1:Person) where p1.id = row.startNodeId match(p2:Person) where p2.id = row.endNodeId create (p1)-[:LINK]->(p2)",
"batchDataVariableName": "batch",
"batch_size": "33",
"properties": [
{
"name": "startNodeId",
"type": "STRING"
},
{
"name": "endNodeId",
"type": "STRING"
}
]
}
}
```
### 节点/关系类型动态写
> 需要使用AOPC函数拓展如果你的数据库没有请安装APOC函数拓展
```json
"writer": {
"name": "neo4jWriter",
"parameter": {
"uri": "bolt://localhost:7687",
"username": "yourUserName",
"password": "yourPassword",
"database": "yourDataBase",
"cypher": "unwind $batch as row CALL apoc.cypher.doIt( 'create (n:`' + row.Label + '`{id:$id})' ,{id: row.id} ) YIELD value RETURN 1 ",
"batchDataVariableName": "batch",
"batch_size": "1",
"properties": [
{
"name": "Label",
"type": "STRING"
},
{
"name": "id",
"type": "STRING"
}
]
}
}
```
## 注意事项
* properties定义的顺序需要与reader端顺序一一对应。
* 灵活使用map类型可以免去很多数据加工的烦恼。在cypher中可以根据 . 属性访问符号一直取值。比如 unwind $batch as row create (p) set p.name = row.prop.name,set p.age = row.prop.age在这个例子中prop是map类型包含name和age两个属性。
* 如果提示事务超时建议调大事务运行时间或者调小batchSize
* 如果用于更新场景,遇到死锁问题影响写入,建议二开源码加入死锁异常检测,并进行重试。
## 性能报告
**JVM参数**
16G G1垃圾收集器 8核心
**Neo4j数据库配置**
32核心256G
**datax 配置**
* Channel 20 batchsize = 1000
* 任务平均流量15.23MB/s
* 记录写入速度44440 rec/s
* 读出记录总数2222013

100
neo4jwriter/pom.xml Normal file
View File

@ -0,0 +1,100 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-all</artifactId>
<version>0.0.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>neo4jwriter</artifactId>
<name>neo4jwriter</name>
<packaging>jar</packaging>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<neo4j-java-driver.version>4.4.9</neo4j-java-driver.version>
<junit4.version>4.13.2</junit4.version>
<test.container.version>1.17.6</test.container.version>
</properties>
<dependencies>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
</dependency>
<dependency>
<groupId>org.neo4j.driver</groupId>
<artifactId>neo4j-java-driver</artifactId>
<version>${neo4j-java-driver.version}</version>
</dependency>
<dependency>
<groupId>com.alibaba.datax</groupId>
<artifactId>datax-common</artifactId>
<version>${datax-project-version}</version>
</dependency>
<dependency>
<groupId>org.testcontainers</groupId>
<artifactId>testcontainers</artifactId>
<version>${test.container.version}</version>
</dependency>
<!-- Testcontainers 1.x is tightly coupled with the JUnit 4.x rule API-->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit4.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<resources>
<resource>
<directory>src/main/resources</directory>
<includes>
<include>**/*.*</include>
</includes>
<filtering>true</filtering>
</resource>
</resources>
<plugins>
<!-- compiler plugin -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${jdk-version}</source>
<target>${jdk-version}</target>
<encoding>${project-sourceEncoding}</encoding>
</configuration>
</plugin>
<!-- assembly plugin -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptors>
<descriptor>src/main/assembly/package.xml</descriptor>
</descriptors>
<finalName>datax</finalName>
</configuration>
<executions>
<execution>
<id>dwzip</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,35 @@
<assembly
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
<id></id>
<formats>
<format>dir</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<fileSets>
<fileSet>
<directory>src/main/resources</directory>
<includes>
<include>plugin.json</include>
<include>plugin_job_template.json</include>
</includes>
<outputDirectory>plugin/writer/neo4jwriter</outputDirectory>
</fileSet>
<fileSet>
<directory>target/</directory>
<includes>
<include>neo4jwriter-0.0.1-SNAPSHOT.jar</include>
</includes>
<outputDirectory>plugin/writer/neo4jwriter</outputDirectory>
</fileSet>
</fileSets>
<dependencySets>
<dependencySet>
<useProjectArtifact>false</useProjectArtifact>
<outputDirectory>plugin/writer/neo4jwriter/libs</outputDirectory>
<scope>runtime</scope>
</dependencySet>
</dependencySets>
</assembly>

View File

@ -0,0 +1,256 @@
package com.alibaba.datax.plugin.writer.neo4jwriter;
import com.alibaba.datax.common.element.Column;
import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.TaskPluginCollector;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.common.util.RetryUtil;
import com.alibaba.datax.plugin.writer.neo4jwriter.adapter.DateAdapter;
import com.alibaba.datax.plugin.writer.neo4jwriter.adapter.ValueAdapter;
import com.alibaba.datax.plugin.writer.neo4jwriter.config.Neo4jProperty;
import com.alibaba.datax.plugin.writer.neo4jwriter.exception.Neo4jErrorCode;
import com.alibaba.fastjson2.JSON;
import org.apache.commons.lang3.StringUtils;
import org.neo4j.driver.*;
import org.neo4j.driver.exceptions.Neo4jException;
import org.neo4j.driver.internal.value.MapValue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.concurrent.TimeUnit;
import static com.alibaba.datax.plugin.writer.neo4jwriter.config.ConfigConstants.*;
import static com.alibaba.datax.plugin.writer.neo4jwriter.exception.Neo4jErrorCode.DATABASE_ERROR;
public class Neo4jClient {
private static final Logger LOGGER = LoggerFactory.getLogger(Neo4jClient.class);
private Driver driver;
private WriteConfig writeConfig;
private RetryConfig retryConfig;
private TaskPluginCollector taskPluginCollector;
private Session session;
private List<MapValue> writerBuffer;
public Neo4jClient(Driver driver,
WriteConfig writeConfig,
RetryConfig retryConfig,
TaskPluginCollector taskPluginCollector) {
this.driver = driver;
this.writeConfig = writeConfig;
this.retryConfig = retryConfig;
this.taskPluginCollector = taskPluginCollector;
this.writerBuffer = new ArrayList<>(writeConfig.batchSize);
}
public void init() {
String database = writeConfig.database;
//neo4j 3.x 没有数据库
if (null != database && !"".equals(database)) {
this.session = driver.session(SessionConfig.forDatabase(database));
} else {
this.session = driver.session();
}
}
public static Neo4jClient build(Configuration config, TaskPluginCollector taskPluginCollector) {
Driver driver = buildNeo4jDriver(config);
String cypher = checkCypher(config);
String database = config.getString(DATABASE.getKey());
String batchVariableName = config.getString(BATCH_DATA_VARIABLE_NAME.getKey(),
BATCH_DATA_VARIABLE_NAME.getDefaultValue());
List<Neo4jProperty> neo4jProperties = JSON.parseArray(config.getString(NEO4J_PROPERTIES.getKey()), Neo4jProperty.class);
int batchSize = config.getInt(BATCH_SIZE.getKey(), BATCH_SIZE.getDefaultValue());
int retryTimes = config.getInt(RETRY_TIMES.getKey(), RETRY_TIMES.getDefaultValue());
return new Neo4jClient(driver,
new WriteConfig(cypher, database, batchVariableName, neo4jProperties, batchSize),
new RetryConfig(retryTimes, config.getLong(RETRY_SLEEP_MILLS.getKey(), RETRY_SLEEP_MILLS.getDefaultValue())),
taskPluginCollector
);
}
private static String checkCypher(Configuration config) {
String cypher = config.getString(CYPHER.getKey());
if (StringUtils.isBlank(cypher)) {
throw DataXException.asDataXException(Neo4jErrorCode.CONFIG_INVALID, "cypher must not null or empty");
}
return cypher;
}
private static Driver buildNeo4jDriver(Configuration config) {
Config.ConfigBuilder configBuilder = Config.builder().withMaxConnectionPoolSize(1);
String uri = checkUriConfig(config);
//connection timeout
//连接超时时间
Long maxConnTime = config.getLong(MAX_CONNECTION_TIMEOUT_SECONDS.getKey(), MAX_TRANSACTION_RETRY_TIME.getDefaultValue());
configBuilder
.withConnectionAcquisitionTimeout(
maxConnTime * 2, TimeUnit.SECONDS)
.withConnectionTimeout(maxConnTime, TimeUnit.SECONDS);
//transaction timeout
//事务运行超时时间
Long txRetryTime = config.getLong(MAX_TRANSACTION_RETRY_TIME.getKey(), MAX_TRANSACTION_RETRY_TIME.getDefaultValue());
configBuilder.withMaxTransactionRetryTime(txRetryTime, TimeUnit.SECONDS);
String username = config.getString(USERNAME.getKey());
String password = config.getString(PASSWORD.getKey());
String bearerToken = config.getString(BEARER_TOKEN.getKey());
String kerberosTicket = config.getString(KERBEROS_TICKET.getKey());
if (StringUtils.isNotBlank(username) && StringUtils.isNotBlank(password)) {
return GraphDatabase.driver(uri, AuthTokens.basic(username, password), configBuilder.build());
} else if (StringUtils.isNotBlank(bearerToken)) {
return GraphDatabase.driver(uri, AuthTokens.bearer(bearerToken), configBuilder.build());
} else if (StringUtils.isNotBlank(kerberosTicket)) {
return GraphDatabase.driver(uri, AuthTokens.kerberos(kerberosTicket), configBuilder.build());
}
throw DataXException.asDataXException(Neo4jErrorCode.CONFIG_INVALID, "Invalid Auth config.");
}
private static String checkUriConfig(Configuration config) {
String uri = config.getString(URI.getKey());
if (null == uri || uri.length() == 0) {
throw DataXException.asDataXException(Neo4jErrorCode.CONFIG_INVALID, "Invalid uri configuration");
}
return uri;
}
public void destroy() {
tryFlushBuffer();
if (driver != null) {
driver.close();
}
if (session != null) {
session.close();
}
DateAdapter.destroy();
}
private void tryFlushBuffer() {
if (!writerBuffer.isEmpty()) {
doWrite(writerBuffer);
writerBuffer.clear();
}
}
private void tryBatchWrite() {
if (!writerBuffer.isEmpty() && writerBuffer.size() >= writeConfig.batchSize) {
doWrite(writerBuffer);
writerBuffer.clear();
}
}
private void doWrite(List<MapValue> values) {
Value batchValues = Values.parameters(this.writeConfig.batchVariableName, values);
Query query = new Query(this.writeConfig.cypher, batchValues);
// LOGGER.debug("query:{}", query.text());
// LOGGER.debug("batch:{}", toUnwindStr(values));
try {
RetryUtil.executeWithRetry(() -> {
session.writeTransaction(tx -> tx.run(query));
return null;
}, this.retryConfig.retryTimes, retryConfig.retrySleepMills, true,
Collections.singletonList(Neo4jException.class));
} catch (Exception e) {
LOGGER.error("an exception occurred while writing to the database,message:{}", e.getMessage());
throw DataXException.asDataXException(DATABASE_ERROR, e.getMessage());
}
}
private String toUnwindStr(List<MapValue> values) {
StringJoiner joiner = new StringJoiner(",");
for (MapValue value : values) {
joiner.add(value.toString());
}
return "[" + joiner + "]";
}
public void tryWrite(Record record) {
MapValue neo4jValue = checkAndConvert(record);
writerBuffer.add(neo4jValue);
tryBatchWrite();
}
private MapValue checkAndConvert(Record record) {
int sourceColNum = record.getColumnNumber();
List<Neo4jProperty> neo4jProperties = writeConfig.neo4jProperties;
if (neo4jProperties == null || neo4jProperties.size() != sourceColNum) {
throw new DataXException(Neo4jErrorCode.CONFIG_INVALID, "the read and write columns do not match!");
}
Map<String, Value> data = new HashMap<>(sourceColNum * 4 / 3);
for (int i = 0; i < sourceColNum; i++) {
Column column = record.getColumn(i);
Neo4jProperty neo4jProperty = neo4jProperties.get(i);
try {
Value value = ValueAdapter.column2Value(column, neo4jProperty);
data.put(neo4jProperty.getName(), value);
} catch (Exception e) {
LOGGER.info("dirty record{},message :{}", column, e.getMessage());
this.taskPluginCollector.collectDirtyRecord(record, e.getMessage());
}
}
return new MapValue(data);
}
public List<Neo4jProperty> getNeo4jFields() {
return this.writeConfig.neo4jProperties;
}
static class RetryConfig {
int retryTimes;
long retrySleepMills;
RetryConfig(int retryTimes, long retrySleepMills) {
this.retryTimes = retryTimes;
this.retrySleepMills = retrySleepMills;
}
}
static class WriteConfig {
String cypher;
String database;
String batchVariableName;
List<Neo4jProperty> neo4jProperties;
int batchSize;
public WriteConfig(String cypher,
String database,
String batchVariableName,
List<Neo4jProperty> neo4jProperties,
int batchSize) {
this.cypher = cypher;
this.database = database;
this.batchVariableName = batchVariableName;
this.neo4jProperties = neo4jProperties;
this.batchSize = batchSize;
}
}
}

View File

@ -0,0 +1,64 @@
package com.alibaba.datax.plugin.writer.neo4jwriter;
import com.alibaba.datax.common.plugin.RecordReceiver;
import com.alibaba.datax.common.spi.Writer;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.common.element.Record;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
public class Neo4jWriter extends Writer {
public static class Job extends Writer.Job {
private static final Logger LOGGER = LoggerFactory.getLogger(Job.class);
private Configuration jobConf = null;
@Override
public void init() {
LOGGER.info("Neo4jWriter Job init success");
this.jobConf = getPluginJobConf();
}
@Override
public void destroy() {
LOGGER.info("Neo4jWriter Job destroyed");
}
@Override
public List<Configuration> split(int mandatoryNumber) {
List<Configuration> configurations = new ArrayList<Configuration>(mandatoryNumber);
for (int i = 0; i < mandatoryNumber; i++) {
configurations.add(this.jobConf.clone());
}
return configurations;
}
}
public static class Task extends Writer.Task {
private static final Logger TASK_LOGGER = LoggerFactory.getLogger(Task.class);
private Neo4jClient neo4jClient;
@Override
public void init() {
Configuration taskConf = super.getPluginJobConf();
this.neo4jClient = Neo4jClient.build(taskConf,getTaskPluginCollector());
this.neo4jClient.init();
TASK_LOGGER.info("neo4j writer task init success.");
}
@Override
public void destroy() {
this.neo4jClient.destroy();
TASK_LOGGER.info("neo4j writer task destroyed.");
}
@Override
public void startWrite(RecordReceiver receiver) {
Record record;
while ((record = receiver.getFromReader()) != null){
this.neo4jClient.tryWrite(record);
}
}
}
}

View File

@ -0,0 +1,70 @@
package com.alibaba.datax.plugin.writer.neo4jwriter.adapter;
import com.alibaba.datax.plugin.writer.neo4jwriter.config.Neo4jProperty;
import org.testcontainers.shaded.com.google.common.base.Supplier;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.time.format.DateTimeFormatter;
/**
* @author fuyouj
*/
public class DateAdapter {
private static final ThreadLocal<DateTimeFormatter> LOCAL_DATE_FORMATTER_MAP = new ThreadLocal<>();
private static final ThreadLocal<DateTimeFormatter> LOCAL_TIME_FORMATTER_MAP = new ThreadLocal<>();
private static final ThreadLocal<DateTimeFormatter> LOCAL_DATE_TIME_FORMATTER_MAP = new ThreadLocal<>();
private static final String DEFAULT_LOCAL_DATE_FORMATTER = "yyyy-MM-dd";
private static final String DEFAULT_LOCAL_TIME_FORMATTER = "HH:mm:ss";
private static final String DEFAULT_LOCAL_DATE_TIME_FORMATTER = "yyyy-MM-dd HH:mm:ss";
public static LocalDate localDate(String text, Neo4jProperty neo4jProperty) {
if (LOCAL_DATE_FORMATTER_MAP.get() != null) {
return LocalDate.parse(text, LOCAL_DATE_FORMATTER_MAP.get());
}
String format = getOrDefault(neo4jProperty::getDateFormat, DEFAULT_LOCAL_DATE_FORMATTER);
DateTimeFormatter dateTimeFormatter = DateTimeFormatter.ofPattern(format);
LOCAL_DATE_FORMATTER_MAP.set(dateTimeFormatter);
return LocalDate.parse(text, dateTimeFormatter);
}
public static String getOrDefault(Supplier<String> dateFormat, String defaultFormat) {
String format = dateFormat.get();
if (null == format || "".equals(format)) {
return defaultFormat;
} else {
return format;
}
}
public static void destroy() {
LOCAL_DATE_FORMATTER_MAP.remove();
LOCAL_TIME_FORMATTER_MAP.remove();
LOCAL_DATE_TIME_FORMATTER_MAP.remove();
}
public static LocalTime localTime(String text, Neo4jProperty neo4JProperty) {
if (LOCAL_TIME_FORMATTER_MAP.get() != null) {
return LocalTime.parse(text, LOCAL_TIME_FORMATTER_MAP.get());
}
String format = getOrDefault(neo4JProperty::getDateFormat, DEFAULT_LOCAL_TIME_FORMATTER);
DateTimeFormatter dateTimeFormatter = DateTimeFormatter.ofPattern(format);
LOCAL_TIME_FORMATTER_MAP.set(dateTimeFormatter);
return LocalTime.parse(text, dateTimeFormatter);
}
public static LocalDateTime localDateTime(String text, Neo4jProperty neo4JProperty) {
if (LOCAL_DATE_TIME_FORMATTER_MAP.get() != null){
return LocalDateTime.parse(text,LOCAL_DATE_TIME_FORMATTER_MAP.get());
}
String format = getOrDefault(neo4JProperty::getDateFormat, DEFAULT_LOCAL_DATE_TIME_FORMATTER);
DateTimeFormatter dateTimeFormatter = DateTimeFormatter.ofPattern(format);
LOCAL_DATE_TIME_FORMATTER_MAP.set(dateTimeFormatter);
return LocalDateTime.parse(text, dateTimeFormatter);
}
}

View File

@ -0,0 +1,95 @@
package com.alibaba.datax.plugin.writer.neo4jwriter.adapter;
import com.alibaba.datax.common.element.Column;
import com.alibaba.datax.plugin.writer.neo4jwriter.config.Neo4jProperty;
import com.alibaba.datax.plugin.writer.neo4jwriter.element.PropertyType;
import com.alibaba.fastjson2.JSON;
import org.neo4j.driver.Value;
import org.neo4j.driver.Values;
import org.neo4j.driver.internal.value.NullValue;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
/**
* @author fuyouj
*/
public class ValueAdapter {
public static Value column2Value(final Column column, final Neo4jProperty neo4JProperty) {
String typeStr = neo4JProperty.getType();
PropertyType type = PropertyType.fromStrIgnoreCase(typeStr);
if (column.asString() == null) {
return NullValue.NULL;
}
switch (type) {
case NULL:
return NullValue.NULL;
case MAP:
return Values.value(JSON.parseObject(column.asString(), Map.class));
case BOOLEAN:
return Values.value(column.asBoolean());
case STRING:
return Values.value(column.asString());
case INTEGER:
case LONG:
return Values.value(column.asLong());
case SHORT:
return Values.value(Short.valueOf(column.asString()));
case FLOAT:
case DOUBLE:
return Values.value(column.asDouble());
case BYTE_ARRAY:
return Values.value(parseArrayType(neo4JProperty, column.asString(), Byte::valueOf));
case CHAR_ARRAY:
return Values.value(parseArrayType(neo4JProperty, column.asString(), (s) -> s.charAt(0)));
case BOOLEAN_ARRAY:
return Values.value(parseArrayType(neo4JProperty, column.asString(), Boolean::valueOf));
case STRING_ARRAY:
case Object_ARRAY:
case LIST:
return Values.value(parseArrayType(neo4JProperty, column.asString(), Function.identity()));
case LONG_ARRAY:
return Values.value(parseArrayType(neo4JProperty, column.asString(), Long::valueOf));
case INT_ARRAY:
return Values.value(parseArrayType(neo4JProperty, column.asString(), Integer::valueOf));
case SHORT_ARRAY:
return Values.value(parseArrayType(neo4JProperty, column.asString(), Short::valueOf));
case DOUBLE_ARRAY:
case FLOAT_ARRAY:
return Values.value(parseArrayType(neo4JProperty, column.asString(), Double::valueOf));
case LOCAL_DATE:
return Values.value(DateAdapter.localDate(column.asString(), neo4JProperty));
case LOCAL_TIME:
return Values.value(DateAdapter.localTime(column.asString(), neo4JProperty));
case LOCAL_DATE_TIME:
return Values.value(DateAdapter.localDateTime(column.asString(), neo4JProperty));
default:
return Values.value(column.getRawData());
}
}
private static <R> List<R> parseArrayType(final Neo4jProperty neo4JProperty,
final String strValue,
final Function<String, R> convertFunc) {
if (null == strValue || "".equals(strValue)) {
return Collections.emptyList();
}
String split = neo4JProperty.getSplitOrDefault();
String[] strArr = strValue.split(split);
List<R> ans = new ArrayList<>();
for (String s : strArr) {
ans.add(convertFunc.apply(s));
}
return ans;
}
}

View File

@ -0,0 +1,116 @@
package com.alibaba.datax.plugin.writer.neo4jwriter.config;
import java.util.List;
/**
* @author fuyouj
*/
public final class ConfigConstants {
public static final Long DEFAULT_MAX_TRANSACTION_RETRY_SECONDS = 30L;
public static final Long DEFAULT_MAX_CONNECTION_SECONDS = 30L;
public static final Option<Integer> RETRY_TIMES =
Option.<Integer>builder()
.key("retryTimes")
.defaultValue(3)
.desc("The number of overwrites when an error occurs")
.build();
public static final Option<Long> RETRY_SLEEP_MILLS =
Option.<Long>builder()
.key("retrySleepMills")
.defaultValue(3000L)
.build();
/**
* cluster mode please reference
* <a href="https://neo4j.com/docs/java-manual/current/client-applications/">how to connect cluster mode</a>
*/
public static final Option<String> URI =
Option.<String>builder()
.key("uri")
.noDefaultValue()
.desc("uir of neo4j database")
.build();
public static final Option<String> USERNAME =
Option.<String>builder()
.key("username")
.noDefaultValue()
.desc("username for accessing the neo4j database")
.build();
public static final Option<String> PASSWORD =
Option.<String>builder()
.key("password")
.noDefaultValue()
.desc("password for accessing the neo4j database")
.build();
public static final Option<String> BEARER_TOKEN =
Option.<String>builder()
.key("bearerToken")
.noDefaultValue()
.desc("base64 encoded bearer token of the Neo4j. for Auth.")
.build();
public static final Option<String> KERBEROS_TICKET =
Option.<String>builder()
.key("kerberosTicket")
.noDefaultValue()
.desc("base64 encoded kerberos ticket of the Neo4j. for Auth.")
.build();
public static final Option<String> DATABASE =
Option.<String>builder()
.key("database")
.noDefaultValue()
.desc("database name.")
.build();
public static final Option<String> CYPHER =
Option.<String>builder()
.key("cypher")
.noDefaultValue()
.desc("cypher query.")
.build();
public static final Option<Long> MAX_TRANSACTION_RETRY_TIME =
Option.<Long>builder()
.key("maxTransactionRetryTimeSeconds")
.defaultValue(DEFAULT_MAX_TRANSACTION_RETRY_SECONDS)
.desc("maximum transaction retry time(seconds). transaction fail if exceeded.")
.build();
public static final Option<Long> MAX_CONNECTION_TIMEOUT_SECONDS =
Option.<Long>builder()
.key("maxConnectionTimeoutSeconds")
.defaultValue(DEFAULT_MAX_CONNECTION_SECONDS)
.desc("The maximum amount of time to wait for a TCP connection to be established (seconds).")
.build();
public static final Option<String> BATCH_DATA_VARIABLE_NAME =
Option.<String>builder()
.key("batchDataVariableName")
.defaultValue("batch")
.desc("in a cypher statement, a variable name that represents a batch of data")
.build();
public static final Option<List<Neo4jProperty>> NEO4J_PROPERTIES =
Option.<List<Neo4jProperty>>builder()
.key("properties")
.noDefaultValue()
.desc("neo4j node or relation`s props")
.build();
public static final Option<Integer> BATCH_SIZE =
Option.<Integer>builder().
key("batchSize")
.defaultValue(1000)
.desc("max batch size")
.build();
}

View File

@ -0,0 +1,82 @@
package com.alibaba.datax.plugin.writer.neo4jwriter.config;
/**
* 由于dataX并不能传输数据的元数据所以只能在writer端定义每列数据的名字
* datax does not support data metadata,
* only the name of each column of data can be defined on neo4j writer
*
* @author fuyouj
*/
public class Neo4jProperty {
public static final String DEFAULT_SPLIT = ",";
/**
* name of neo4j field
*/
private String name;
/**
* neo4j type
* reference by org.neo4j.driver.Values
*/
private String type;
/**
* for date
*/
private String dateFormat;
/**
* for array type
*/
private String split;
public Neo4jProperty() {
}
public Neo4jProperty(String name, String type, String format, String split) {
this.name = name;
this.type = type;
this.dateFormat = format;
this.split = split;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public String getDateFormat() {
return dateFormat;
}
public void setDateFormat(String dateFormat) {
this.dateFormat = dateFormat;
}
public String getSplit() {
return getSplitOrDefault();
}
public String getSplitOrDefault() {
if (split == null || "".equals(split)) {
return DEFAULT_SPLIT;
}
return split;
}
public void setSplit(String split) {
this.split = split;
}
}

View File

@ -0,0 +1,65 @@
package com.alibaba.datax.plugin.writer.neo4jwriter.config;
public class Option<T> {
public static class Builder<T> {
private String key;
private String desc;
private T defaultValue;
public Builder<T> key(String key) {
this.key = key;
return this;
}
public Builder<T> desc(String desc) {
this.desc = desc;
return this;
}
public Builder<T> defaultValue(T defaultValue) {
this.defaultValue = defaultValue;
return this;
}
public Builder<T> noDefaultValue() {
return this;
}
public Option<T> build() {
return new Option<>(this.key, this.desc, this.defaultValue);
}
}
private final String key;
private final String desc;
private final T defaultValue;
public Option(String key, String desc, T defaultValue) {
this.key = key;
this.desc = desc;
this.defaultValue = defaultValue;
}
public static <T> Builder<T> builder(){
return new Builder<>();
}
public String getKey() {
return key;
}
public String getDesc() {
return desc;
}
public T getDefaultValue() {
if (defaultValue == null){
throw new IllegalStateException(key + ":defaultValue is null");
}
return defaultValue;
}
}

View File

@ -0,0 +1,40 @@
package com.alibaba.datax.plugin.writer.neo4jwriter.element;
import java.util.Arrays;
/**
* @see org.neo4j.driver.Values
* @author fuyouj
*/
public enum PropertyType {
NULL,
BOOLEAN,
STRING,
LONG,
SHORT,
INTEGER,
DOUBLE,
FLOAT,
LOCAL_DATE,
LOCAL_TIME,
LOCAL_DATE_TIME,
LIST,
MAP,
CHAR_ARRAY,
BYTE_ARRAY,
BOOLEAN_ARRAY,
STRING_ARRAY,
LONG_ARRAY,
INT_ARRAY,
SHORT_ARRAY,
DOUBLE_ARRAY,
FLOAT_ARRAY,
Object_ARRAY;
public static PropertyType fromStrIgnoreCase(String typeStr) {
return Arrays.stream(PropertyType.values())
.filter(e -> e.name().equalsIgnoreCase(typeStr))
.findFirst()
.orElse(PropertyType.STRING);
}
}

View File

@ -0,0 +1,37 @@
package com.alibaba.datax.plugin.writer.neo4jwriter.exception;
import com.alibaba.datax.common.spi.ErrorCode;
public enum Neo4jErrorCode implements ErrorCode {
/**
* Invalid configuration
* 配置校验异常
*/
CONFIG_INVALID("NEO4J_ERROR_01","invalid configuration"),
/**
* database error
* 在执行写入到数据库时抛出的异常可能是权限异常也可能是连接超时或者是配置到了从节点
* 如果是更新操作还会有死锁异常具体原因根据报错信息确定但是这与dataX无关
*/
DATABASE_ERROR("NEO4J_ERROR_02","database error");
private final String code;
private final String description;
@Override
public String getCode() {
return code;
}
@Override
public String getDescription() {
return description;
}
Neo4jErrorCode(String code, String description) {
this.code = code;
this.description = description;
}
}

View File

@ -0,0 +1,6 @@
{
"name": "neo4jWriter",
"class": "com.alibaba.datax.plugin.writer.neo4jwriter.Neo4jWriter",
"description": "dataX neo4j 写插件",
"developer": "付有杰"
}

View File

@ -0,0 +1,42 @@
{
"uri": "neo4j://localhost:7687",
"username": "neo4j",
"password": "Test@12343",
"database": "neo4j",
"cypher": "unwind $batch as row create(p:Person) set p.pbool = row.pbool,p.pstring = row.pstring,p.plong = row.plong,p.pshort = row.pshort,p.pdouble=row.pdouble,p.pstringarr=row.pstringarr,p.plocaldate=row.plocaldate",
"batchDataVariableName": "batch",
"batchSize": "33",
"properties": [
{
"name": "pbool",
//type
"type": "BOOLEAN"
},
{
"name": "pstring",
"type": "STRING"
},
{
"name": "plong",
"type": "LONG"
},
{
"name": "pshort",
"type": "SHORT"
},
{
"name": "pdouble",
"type": "DOUBLE"
},
{
"name": "pstringarr",
"type": "STRING_ARRAY",
"split": ","
},
{
"name": "plocaldate",
"type": "LOCAL_DATE",
"dateFormat": "yyyy-MM-dd"
}
]
}

View File

@ -0,0 +1,257 @@
package com.alibaba.datax.plugin.writer;
import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.element.StringColumn;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.plugin.writer.mock.MockRecord;
import com.alibaba.datax.plugin.writer.mock.MockUtil;
import com.alibaba.datax.plugin.writer.neo4jwriter.Neo4jClient;
import com.alibaba.datax.plugin.writer.neo4jwriter.config.Neo4jProperty;
import com.alibaba.datax.plugin.writer.neo4jwriter.element.PropertyType;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.neo4j.driver.*;
import org.neo4j.driver.types.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.containers.Network;
import org.testcontainers.containers.output.Slf4jLogConsumer;
import org.testcontainers.lifecycle.Startables;
import org.testcontainers.shaded.org.awaitility.Awaitility;
import org.testcontainers.utility.DockerImageName;
import org.testcontainers.utility.DockerLoggerFactory;
import java.io.File;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
public class Neo4jWriterTest {
private static final Logger LOGGER = LoggerFactory.getLogger(Neo4jWriterTest.class);
private static final int MOCK_NUM = 100;
private static final String CONTAINER_IMAGE = "neo4j:5.9.0";
private static final String CONTAINER_HOST = "neo4j-host";
private static final int HTTP_PORT = 7474;
private static final int BOLT_PORT = 7687;
private static final String CONTAINER_NEO4J_USERNAME = "neo4j";
private static final String CONTAINER_NEO4J_PASSWORD = "Test@12343";
private static final URI CONTAINER_URI = URI.create("neo4j://localhost:" + BOLT_PORT);
protected static final Network NETWORK = Network.newNetwork();
private GenericContainer<?> container;
private Driver neo4jDriver;
private Session neo4jSession;
@Before
public void init() {
DockerImageName imageName = DockerImageName.parse(CONTAINER_IMAGE);
container =
new GenericContainer<>(imageName)
.withNetwork(NETWORK)
.withNetworkAliases(CONTAINER_HOST)
.withExposedPorts(HTTP_PORT, BOLT_PORT)
.withEnv(
"NEO4J_AUTH",
CONTAINER_NEO4J_USERNAME + "/" + CONTAINER_NEO4J_PASSWORD)
.withEnv("apoc.export.file.enabled", "true")
.withEnv("apoc.import.file.enabled", "true")
.withEnv("apoc.import.file.use_neo4j_config", "true")
.withEnv("NEO4J_PLUGINS", "[\"apoc\"]")
.withLogConsumer(
new Slf4jLogConsumer(
DockerLoggerFactory.getLogger(CONTAINER_IMAGE)));
container.setPortBindings(
Arrays.asList(
String.format("%s:%s", HTTP_PORT, HTTP_PORT),
String.format("%s:%s", BOLT_PORT, BOLT_PORT)));
Startables.deepStart(Stream.of(container)).join();
LOGGER.info("container started");
Awaitility.given()
.ignoreExceptions()
.await()
.atMost(30, TimeUnit.SECONDS)
.untilAsserted(this::initConnection);
}
@Test
public void testCreateNodeAllTypeField() {
final Result checkExists = neo4jSession.run("MATCH (p:Person) RETURN p limit 1");
if (checkExists.hasNext()) {
neo4jSession.run("MATCH (p:Person) delete p");
}
Configuration configuration = Configuration.from(new File("src/test/resources/allTypeFieldNode.json"));
Neo4jClient neo4jClient = Neo4jClient.build(configuration, null);
neo4jClient.init();
for (int i = 0; i < MOCK_NUM; i++) {
neo4jClient.tryWrite(mockAllTypeFieldTestNode(neo4jClient.getNeo4jFields()));
}
neo4jClient.destroy();
Result result = neo4jSession.run("MATCH (p:Person) return p");
// nodes
assertTrue(result.hasNext());
int cnt = 0;
while (result.hasNext()) {
org.neo4j.driver.Record record = result.next();
record.get("p").get("pbool").asBoolean();
record.get("p").get("pstring").asString();
record.get("p").get("plong").asLong();
record.get("p").get("pshort").asInt();
record.get("p").get("pdouble").asDouble();
List list = (List) record.get("p").get("pstringarr").asObject();
record.get("p").get("plocaldate").asLocalDate();
cnt++;
}
assertEquals(cnt, MOCK_NUM);
}
/**
* 创建关系 必须先有节点
* 所以先创建节点再模拟关系
*/
@Test
public void testCreateRelation() {
final Result checkExists = neo4jSession.run("MATCH (p1:Person)-[r:LINK]->(p1:Person) return r limit 1");
if (checkExists.hasNext()) {
neo4jSession.run("MATCH (p1:Person)-[r:LINK]->(p1:Person) delete r,p1,p2");
}
String createNodeCql = "create (p:Person) set p.id = '%s'";
Configuration configuration = Configuration.from(new File("src/test/resources/relationship.json"));
Neo4jClient neo4jClient = Neo4jClient.build(configuration, null);
neo4jClient.init();
//创建节点为后续写关系做准备
//Create nodes to prepare for subsequent write relationships
for (int i = 0; i < MOCK_NUM; i++) {
neo4jSession.run(String.format(createNodeCql, i + "start"));
neo4jSession.run(String.format(createNodeCql, i + "end"));
Record record = new MockRecord();
record.addColumn(new StringColumn(i + "start"));
record.addColumn(new StringColumn(i + "end"));
neo4jClient.tryWrite(record);
}
neo4jClient.destroy();
Result result = neo4jSession.run("MATCH (start:Person)-[r:LINK]->(end:Person) return r,start,end");
// relationships
assertTrue(result.hasNext());
int cnt = 0;
while (result.hasNext()) {
org.neo4j.driver.Record record = result.next();
Node startNode = record.get("start").asNode();
assertTrue(startNode.hasLabel("Person"));
assertTrue(startNode.asMap().containsKey("id"));
Node endNode = record.get("end").asNode();
assertTrue(startNode.hasLabel("Person"));
assertTrue(endNode.asMap().containsKey("id"));
String name = record.get("r").type().name();
assertEquals("RELATIONSHIP", name);
cnt++;
}
assertEquals(cnt, MOCK_NUM);
}
/**
* neo4j中,Label和关系类型,想动态的写需要借助于apoc函数
*/
@Test
public void testUseApocCreateDynamicLabel() {
List<String> dynamicLabel = new ArrayList<>();
for (int i = 0; i < MOCK_NUM; i++) {
dynamicLabel.add("Label" + i);
}
//删除原有数据
//remove test data if exist
//这种占位符的方式不支持批量动态写,当然可以使用union拼接但是性能不好
String query = "match (p:%s) return p";
String delete = "match (p:%s) delete p";
for (String label : dynamicLabel) {
Result result = neo4jSession.run(String.format(query, label));
if (result.hasNext()) {
neo4jSession.run(String.format(delete, label));
}
}
Configuration configuration = Configuration.from(new File("src/test/resources/dynamicLabel.json"));
Neo4jClient neo4jClient = Neo4jClient.build(configuration, null);
neo4jClient.init();
for (int i = 0; i < dynamicLabel.size(); i++) {
Record record = new MockRecord();
record.addColumn(new StringColumn(dynamicLabel.get(i)));
record.addColumn(new StringColumn(String.valueOf(i)));
neo4jClient.tryWrite(record);
}
neo4jClient.destroy();
//校验脚本的批量写入是否正确
int cnt = 0;
for (int i = 0; i < dynamicLabel.size(); i++) {
String label = dynamicLabel.get(i);
Result result = neo4jSession.run(String.format(query, label));
while (result.hasNext()) {
org.neo4j.driver.Record record = result.next();
Node node = record.get("p").asNode();
assertTrue(node.hasLabel(label));
assertEquals(node.asMap().get("id"), i + "");
cnt++;
}
}
assertEquals(cnt, MOCK_NUM);
}
private Record mockAllTypeFieldTestNode(List<Neo4jProperty> neo4JProperties) {
Record mock = new MockRecord();
for (Neo4jProperty field : neo4JProperties) {
mock.addColumn(MockUtil.mockColumnByType(PropertyType.fromStrIgnoreCase(field.getType())));
}
return mock;
}
@After
public void destroy() {
if (neo4jSession != null) {
neo4jSession.close();
}
if (neo4jDriver != null) {
neo4jDriver.close();
}
if (container != null) {
container.close();
}
}
private void initConnection() {
neo4jDriver =
GraphDatabase.driver(
CONTAINER_URI,
AuthTokens.basic(CONTAINER_NEO4J_USERNAME, CONTAINER_NEO4J_PASSWORD));
neo4jSession = neo4jDriver.session(SessionConfig.forDatabase("neo4j"));
}
}

View File

@ -0,0 +1,104 @@
package com.alibaba.datax.plugin.writer.mock;
import com.alibaba.datax.common.element.Column;
import com.alibaba.datax.common.element.Record;
import com.alibaba.fastjson2.JSON;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class MockRecord implements Record {
private static final int RECORD_AVERGAE_COLUMN_NUMBER = 16;
private List<Column> columns;
private int byteSize;
private Map<String, Object> meta;
public MockRecord() {
this.columns = new ArrayList<>(RECORD_AVERGAE_COLUMN_NUMBER);
}
@Override
public void addColumn(Column column) {
columns.add(column);
incrByteSize(column);
}
@Override
public Column getColumn(int i) {
if (i < 0 || i >= columns.size()) {
return null;
}
return columns.get(i);
}
@Override
public void setColumn(int i, final Column column) {
if (i < 0) {
throw new IllegalArgumentException("不能给index小于0的column设置值");
}
if (i >= columns.size()) {
expandCapacity(i + 1);
}
decrByteSize(getColumn(i));
this.columns.set(i, column);
incrByteSize(getColumn(i));
}
@Override
public String toString() {
Map<String, Object> json = new HashMap<String, Object>();
json.put("size", this.getColumnNumber());
json.put("data", this.columns);
return JSON.toJSONString(json);
}
@Override
public int getColumnNumber() {
return this.columns.size();
}
@Override
public int getByteSize() {
return byteSize;
}
public int getMemorySize() {
throw new UnsupportedOperationException();
}
@Override
public void setMeta(Map<String, String> meta) {
}
@Override
public Map<String, String> getMeta() {
return null;
}
private void decrByteSize(final Column column) {
}
private void incrByteSize(final Column column) {
}
private void expandCapacity(int totalSize) {
if (totalSize <= 0) {
return;
}
int needToExpand = totalSize - columns.size();
while (needToExpand-- > 0) {
this.columns.add(null);
}
}
}

View File

@ -0,0 +1,50 @@
package com.alibaba.datax.plugin.writer.mock;
import com.alibaba.datax.common.element.*;
import com.alibaba.datax.plugin.writer.neo4jwriter.element.PropertyType;
import com.alibaba.fastjson2.JSON;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
public class MockUtil {
public static Column mockColumnByType(PropertyType type) {
Random random = new Random();
switch (type) {
case SHORT:
return new StringColumn("1");
case BOOLEAN:
return new BoolColumn(random.nextInt() % 2 == 0);
case INTEGER:
case LONG:
return new LongColumn(random.nextInt(Integer.MAX_VALUE));
case FLOAT:
case DOUBLE:
return new DoubleColumn(random.nextDouble());
case NULL:
return null;
case BYTE_ARRAY:
return new BytesColumn(new byte[]{(byte) (random.nextInt() % 2)});
case LOCAL_DATE:
return new StringColumn(LocalDate.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd")));
case MAP:
return new StringColumn(JSON.toJSONString(propmap()));
case STRING_ARRAY:
return new StringColumn("[1,1,1,1,1,1,1]");
default:
return new StringColumn("randomStr" + random.nextInt(Integer.MAX_VALUE));
}
}
public static Map<String, Object> propmap() {
Map<String, Object> prop = new HashMap<>();
prop.put("name", "neo4jWriter");
prop.put("age", "1");
return prop;
}
}

View File

@ -0,0 +1,41 @@
{
"uri": "neo4j://localhost:7687",
"username":"neo4j",
"password":"Test@12343",
"database":"neo4j",
"cypher": "unwind $batch as row create(p:Person) set p.pbool = row.pbool,p.pstring = row.pstring,p.plong = row.plong,p.pshort = row.pshort,p.pdouble=row.pdouble,p.pstringarr=row.pstringarr,p.plocaldate=row.plocaldate",
"batchDataVariableName": "batch",
"batchSize": "33",
"properties": [
{
"name": "pbool",
"type": "BOOLEAN"
},
{
"name": "pstring",
"type": "STRING"
},
{
"name": "plong",
"type": "LONG"
},
{
"name": "pshort",
"type": "SHORT"
},
{
"name": "pdouble",
"type": "DOUBLE"
},
{
"name": "pstringarr",
"type": "STRING_ARRAY",
"split": ","
},
{
"name": "plocaldate",
"type": "LOCAL_DATE",
"dateFormat": "yyyy-MM-dd"
}
]
}

View File

@ -0,0 +1,19 @@
{
"uri": "bolt://localhost:7687",
"username":"neo4j",
"password":"Test@12343",
"database":"neo4j",
"cypher": "unwind $batch as row CALL apoc.cypher.doIt( 'create (n:`' + row.Label + '`{id:$id})' ,{id: row.id} ) YIELD value RETURN 1 ",
"batchDataVariableName": "batch",
"batchSize": "33",
"properties": [
{
"name": "Label",
"type": "string"
},
{
"name": "id",
"type": "STRING"
}
]
}

View File

@ -0,0 +1,19 @@
{
"uri": "neo4j://localhost:7687",
"username":"neo4j",
"password":"Test@12343",
"database":"neo4j",
"cypher": "unwind $batch as row match(p1:Person) where p1.id = row.startNodeId match(p2:Person) where p2.id = row.endNodeId create (p1)-[:LINK]->(p2)",
"batchDataVariableName": "batch",
"batchSize": "33",
"properties": [
{
"name": "startNodeId",
"type": "STRING"
},
{
"name": "endNodeId",
"type": "STRING"
}
]
}

Some files were not shown because too many files have changed in this diff Show More