
SQOOP-3381: Upgrade the Parquet library from 1.6.0 to 1.9.0

(Fero Szabo via Szabolcs Vasas)
Szabolcs Vasas 2018-10-17 10:16:40 +02:00
parent c329f360dd
commit 5dd8c8aad1
19 changed files with 199 additions and 54 deletions


@@ -81,6 +81,7 @@ configurations.all {
force group: 'org.apache.avro', name: 'avro', version: avroVersion
force group: 'org.apache.avro', name: 'avro-mapred', version: avroVersion
force group: 'com.google.guava', name: 'guava', version: guavaVersion
force group: 'com.google.protobuf', name: 'protobuf-java', version: hiveProtobufVersion
}
exclude group: 'org.apache.hadoop', module: 'avro'
}
@@ -102,32 +103,39 @@ dependencies {
common group: 'org.apache.accumulo', name: 'accumulo-core', version: accumuloVersion
common group: 'org.apache.accumulo', name: 'accumulo-minicluster', version: accumuloVersion
common group: 'org.eclipse.jetty', name: 'jetty-runner', version: jettyVersion
common group: 'org.apache.hbase', name: 'hbase-hadoop2-compat', version: hbaseVersion
common group: 'org.apache.hbase', name: 'hbase-server', version: hbaseVersion
common group: 'org.apache.hbase', name: 'hbase-client', version: hbaseVersion
common group: 'org.apache.hbase', name: 'hbase-common', version: hbaseVersion
common (group: 'org.apache.hive.hcatalog', name: 'hive-hcatalog-core', version: hcatalogVersion) {
common(group: 'org.apache.hive.hcatalog', name: 'hive-hcatalog-core', version: hcatalogVersion) {
exclude group: 'org.apache.avro', module: 'avro'
exclude group: 'org.apache.hive', module: 'hive-exec'
}
common group: 'org.apache.hive', name: 'hive-exec', version: hcatalogVersion, classifier: 'core'
common(group: 'org.apache.hive', name: 'hive-exec', version: hcatalogVersion, classifier: 'core') {
exclude group: 'org.apache.calcite', module: 'calcite-core'
}
// Kryo and calcite are dependencies of hive-exec:core
common group: 'com.esotericsoftware.kryo', name: 'kryo', version: kryoVersion
common group: 'com.esotericsoftware', name: 'kryo', version: kryoVersion
common group: 'org.apache.calcite', name: 'calcite-core', version: calciteVersion
common (group: 'org.apache.hive', name: 'hive-jdbc', version: hcatalogVersion) {
common(group: 'org.apache.hive', name: 'hive-jdbc', version: hcatalogVersion) {
exclude group: 'org.apache.avro', module: 'avro'
exclude group: 'asm', module: 'asm'
}
common group: 'commons-cli', name: 'commons-cli', version: commonscliVersion
common group: 'commons-logging', name: 'commons-logging', version: commonsloggingVersion
common group: 'commons-net', name: 'commons-net', version: commonsnetVersion
common group: 'log4j', name: 'log4j', version: log4jVersion
common group: 'org.postgresql', name: 'postgresql', version: postgresqlVersion
common group: 'org.apache.parquet', name: 'parquet-hadoop-bundle', version: parquetVersion
testCompile group: 'com.h2database', name: 'h2', version: h2Version
testCompile group: 'org.apache.hbase', name: 'hbase-server', version: hbaseVersion, classifier: 'tests'
testCompile group: 'org.apache.hbase', name: 'hbase-hadoop2-compat', version: hbaseVersion, classifier: 'tests'
testCompile group: 'org.apache.hbase', name: 'hbase-hadoop-compat', version: hbaseVersion, classifier: 'tests'
testCompile( group: 'org.apache.hadoop', name: 'hadoop-minikdc', version: hadoopVersion) {
testCompile (group: 'org.apache.hadoop', name: 'hadoop-minikdc', version: hadoopVersion) {
exclude group: 'org.apache.directory.api', module: 'api-ldap-schema-data'
}
testCompile group: 'junit', name: 'junit', version: junitVersion


@@ -20,14 +20,16 @@
javaSourceCompatibilityVersion=1.8
avroVersion=1.8.1
parquetVersion=1.6.0
parquetVersion=1.9.0
hadoopVersion=2.8.0
aspectjVersion=1.7.4
zookeeperVersion=3.4.6
hbaseVersion=1.2.4
hcatalogVersion=1.2.1
kryoVersion=2.22
calciteVersion=1.2.0-incubating
hcatalogVersion=2.1.1
kryoVersion=3.0.3
calciteVersion=1.6.0
# Hive 2.1.1 transitively depends on protobuf 2.5.0
hiveProtobufVersion=2.5.0
guavaVersion=14.0.1
accumuloVersion=1.6.2
@@ -54,7 +56,7 @@ version=1.5.0-SNAPSHOT
postgresqlVersion=9.2-1003-jdbc4
jettyVersion=9.3.20.v20170531
oldHash=b0f391e75154be86f95378ab141f6dd1b3b59475
oldVersion=1.4.7
org.gradle.daemon=true


@@ -39,7 +39,7 @@ dependencies {
}
redist group: 'hsqldb', name: 'hsqldb', version: hsqldbVersion
redist group: 'org.apache.commons', name: 'commons-lang3', version: commonslang3Version
redist group: 'com.twitter', name: 'parquet-avro', version: parquetVersion
redist group: 'org.apache.parquet', name: 'parquet-avro', version: parquetVersion
}
//Jar tasks

ivy.xml

@@ -61,7 +61,7 @@ under the License.
<artifact conf="master"/>
</publications>
<dependencies>
<!-- Dependencies for Hadoop 2.6.0 -->
<!-- Dependencies for Hadoop -->
<dependency org="org.apache.hadoop" name="hadoop-common"
rev="${hadoop.version}" conf="common->default">
<artifact name="hadoop-common" type="jar" />
@@ -116,7 +116,7 @@ under the License.
conf="common->default;redist->default"/>
<dependency org="org.apache.commons" name="commons-lang3" rev="${commons-lang3.version}"
conf="common->default;redist->default"/>
<dependency org="com.twitter" name="parquet-avro" rev="${parquet.version}" conf="common->default;redist->default"/>
<dependency org="org.apache.parquet" name="parquet-avro" rev="${parquet.version}" conf="common->default;redist->default"/>
<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="${jackson-databind.version}"
conf="common->default;redist->default" />
@@ -135,7 +135,6 @@ under the License.
<dependency org="commons-collections" name="commons-collections"
rev="${commons-collections.version}" conf="releaseaudit->default"/>
<!-- Accumulo 1.5.0 -->
<dependency org="org.apache.accumulo" name="accumulo-core" rev="${accumulo.version}"
conf="common->default">
</dependency>
@@ -143,7 +142,6 @@ under the License.
conf="common->default">
</dependency>
<!-- HBase 0.95 -->
<dependency org="org.apache.hbase" name="hbase-client" rev="${hbase.version}" conf="common->default">
<artifact name="hbase-client" type="jar"/>
<artifact name="hbase-client" type="test-jar" ext="jar" m:classifier="tests"/>
@@ -200,19 +198,33 @@ under the License.
<exclude org="org.apache.avro" module="avro" />
</dependency>
<dependency org="org.apache.hive" name="hive-jdbc" rev="${hcatalog.version}" conf="common->default" />
<dependency org="org.apache.hive" name="hive-jdbc" rev="${hcatalog.version}" conf="common->default" >
<exclude org="com.twitter" module="parquet-hadoop-bundle"/>
</dependency>
<dependency org="org.apache.hive.hcatalog" name="hive-hcatalog-core"
rev="${hcatalog.version}" conf="common->default">
<artifact name="hive-hcatalog-core" type="jar"/>
<exclude org="org.apache.avro" module="avro" />
<exclude org="org.apache.hive" module="hive-exec" />
<exclude org="com.twitter" module="parquet-hadoop-bundle"/>
</dependency>
<dependency org="org.apache.hive" name="hive-exec" rev="${hcatalog.version}" conf="common->default" m:classifier="core" >
<exclude org="com.twitter" module="parquet-hadoop-bundle"/>
</dependency>
<dependency org="org.apache.hive" name="hive-exec" rev="${hcatalog.version}" conf="common->default" m:classifier="core" />
<!-- Kryo and calcite are dependencies of hive-exec:core -->
<dependency org="com.esotericsoftware.kryo" name="kryo" rev="${kryo.version}" conf="common->default" />
<dependency org="com.esotericsoftware" name="kryo" rev="${kryo.version}" conf="common->default" />
<dependency org="org.apache.calcite" name="calcite-core" rev="${calcite.version}" conf="common->default" />
<dependency org="com.google.protobuf" name="protobuf-java" rev="${hive.protobuf.version}" force="true" conf="common->default" />
<dependency org="org.eclipse.jetty" name="jetty-runner" rev="${jetty.version}" conf="common->default"/>
<!-- HCatalog test cases need log4j-api 2 and log4j-core 2 dependencies -->
<dependency org="org.apache.logging.log4j" name="log4j-api" rev="${log4j-2.version}" conf="common->default" />
<dependency org="org.apache.logging.log4j" name="log4j-core" rev="${log4j-2.version}" conf="common->default" />
<dependency org="com.sun.jersey" name="jersey-server" rev="${jersey.version}" conf="common->default" />
<dependency org="com.sun.jersey" name="jersey-core" rev="${jersey.version}" conf="common->default" />
<dependency org="com.sun.jersey" name="jersey-servlet" rev="${jersey.version}" conf="common->default" />
<dependency org="org.postgresql" name="postgresql"
rev="${postgresql.version}" conf="common->default" />


@@ -19,6 +19,8 @@
# It drives ivy and the generation of a maven POM
avro.version=1.8.1
parquet.version=1.9.0
checkstyle.version=5.0
@@ -43,6 +45,7 @@ mockito-all.version=1.9.5
h2.version=1.3.170
log4j.version=1.2.16
log4j-2.version=2.8.2
mvn.version=2.0.10
@@ -57,9 +60,12 @@ slf4j.version=1.7.7
hadoop.version=2.8.0
hbase.version=1.2.4
hcatalog.version=1.2.1
kryo.version=2.22
calcite.version=1.2.0-incubating
hcatalog.version=2.1.1
kryo.version=3.0.3
calcite.version=1.6.0
hive.protobuf.version=2.5.0
jetty.version=9.3.20.v20170531
jersey.version=1.19.4
jackson-databind.version=2.9.5
parquet.version=1.6.0


@@ -40,11 +40,11 @@
import org.apache.sqoop.lib.BlobRef;
import org.apache.sqoop.lib.ClobRef;
import org.apache.sqoop.orm.ClassWriter;
import parquet.avro.AvroSchemaConverter;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import java.io.IOException;
import java.math.BigDecimal;
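
For context on the renames above: with Parquet's move to Apache, its classes live under org.apache.parquet rather than the old bare parquet.* packages (and the Maven group changed from com.twitter to org.apache.parquet), which is why every import in this commit is rewritten. The following sketch is not part of the commit; it only illustrates a read through the relocated Avro binding in 1.9.0, with a hypothetical input path and error handling trimmed.

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;

import java.io.IOException;

public class ParquetReadSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical file, e.g. one written by a Sqoop Parquet import.
        Path file = new Path("/tmp/part-m-00000.parquet");
        try (ParquetReader<GenericRecord> reader =
                 AvroParquetReader.<GenericRecord>builder(file).build()) {
            GenericRecord record;
            while ((record = reader.read()) != null) {
                System.out.println(record); // each row comes back as an Avro record
            }
        }
    }
}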


@@ -25,6 +25,7 @@
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.security.Policy;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
@@ -34,6 +35,7 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Shell;
import org.apache.sqoop.mapreduce.hcat.DerbyPolicy;
import org.apache.sqoop.util.Executor;
import org.apache.sqoop.util.LoggingAsyncSink;
import org.apache.sqoop.util.SubprocessSecurityManager;
@@ -239,6 +241,7 @@ public void importTable(String inputTableName, String outputTableName,
private void executeScript(String filename, List<String> env)
throws IOException {
SubprocessSecurityManager subprocessSM = null;
Policy originalPolicy = Policy.getPolicy();
if (testMode) {
// We use external mock hive process for test mode as
@@ -263,6 +266,8 @@ private void executeScript(String filename, List<String> env)
subprocessSM = new SubprocessSecurityManager();
subprocessSM.install();
Policy.setPolicy(new DerbyPolicy());
String[] argv = getHiveArgs("-f", filename);
// And invoke the static method on this array.
@@ -300,6 +305,7 @@ private void executeScript(String filename, List<String> env)
if (null != subprocessSM) {
// Uninstall the SecurityManager used to trap System.exit().
subprocessSM.uninstall();
Policy.setPolicy(originalPolicy);
}
}
}


@@ -0,0 +1,96 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.sqoop.mapreduce.hcat;
import org.apache.derby.security.SystemPermission;
import java.security.CodeSource;
import java.security.Permission;
import java.security.PermissionCollection;
import java.security.Policy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.List;
/**
*
* Initially copied from Hive.
*
* A security policy that grants the usederbyinternals permission.
*
* <p>
* HCatalog tests use a SecurityManager to handle exits. With Derby version 10.14.1, if a
* security manager is configured, embedded Derby requires the usederbyinternals permission,
* which it checks directly using AccessController.checkPermission. This class is used to
* set up a security policy granting usederbyinternals in tests that use NoExitSecurityManager.
* </p>
*/
public class DerbyPolicy extends Policy {
private static PermissionCollection perms;
public DerbyPolicy() {
super();
if (perms == null) {
perms = new DerbyPermissionCollection();
addPermissions();
}
}
@Override
public PermissionCollection getPermissions(CodeSource codesource) {
return perms;
}
private void addPermissions() {
SystemPermission systemPermission = new SystemPermission("engine", "usederbyinternals");
perms.add(systemPermission);
}
class DerbyPermissionCollection extends PermissionCollection {
List<Permission> perms = new ArrayList<>();
@Override
public void add(Permission p) {
perms.add(p);
}
@Override
public boolean implies(Permission p) {
for (Permission perm : perms) {
if (perm.implies(p)) {
return true;
}
}
return false;
}
@Override
public Enumeration<Permission> elements() {
return Collections.enumeration(perms);
}
@Override
public boolean isReadOnly() {
return false;
}
}
}
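
For illustration, the pattern this commit applies around the in-process Hive and HCatalog invocations elsewhere in the diff (HiveImport and SqoopHCatUtilities) is roughly the following sketch; it is not part of the commit, and runInProcess() is a hypothetical stand-in for the guarded call.

import java.security.Policy;

import org.apache.sqoop.mapreduce.hcat.DerbyPolicy;

public class DerbyPolicyUsageSketch {
    public static void main(String[] args) {
        Policy originalPolicy = Policy.getPolicy(); // remember the active policy
        try {
            // Grant usederbyinternals so embedded Derby passes its
            // AccessController.checkPermission check under a SecurityManager.
            Policy.setPolicy(new DerbyPolicy());
            runInProcess();
        } finally {
            Policy.setPolicy(originalPolicy); // always restore the original policy
        }
    }

    private static void runInProcess() {
        // Hypothetical stand-in for the Hive CLI / HCatalog call.
    }
}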


@@ -25,6 +25,7 @@
import java.io.OutputStreamWriter;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.security.Policy;
import java.sql.Types;
import java.util.ArrayList;
import java.util.Arrays;
@@ -1199,12 +1200,14 @@ public void executeExternalHCatProgram(List<String> env, String[] cmdLine)
void executeHCatProgramInProcess(String[] argv) throws IOException {
SubprocessSecurityManager subprocessSM = null;
Policy originalPolicy = Policy.getPolicy();
final ClassLoader originalClassLoader = Thread.currentThread().getContextClassLoader();
try {
Class<?> cliDriverClass = Class.forName(HCAT_CLI_MAIN_CLASS);
subprocessSM = new SubprocessSecurityManager();
subprocessSM.install();
Policy.setPolicy(new DerbyPolicy());
Method mainMethod = cliDriverClass.getMethod("main", argv.getClass());
mainMethod.invoke(null, (Object) argv);
} catch (ClassNotFoundException cnfe) {
@@ -1230,6 +1233,7 @@ void executeHCatProgramInProcess(String[] argv) throws IOException {
if (null != subprocessSM) {
subprocessSM.uninstall();
}
Policy.setPolicy(originalPolicy);
Thread.currentThread().setContextClassLoader(originalClassLoader);
}
}


@@ -23,7 +23,7 @@
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.sqoop.mapreduce.parquet.ParquetExportJobConfigurator;
import parquet.avro.AvroParquetInputFormat;
import org.apache.parquet.avro.AvroParquetInputFormat;
import java.io.IOException;


@@ -27,9 +27,9 @@
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.sqoop.SqoopOptions;
import org.apache.sqoop.mapreduce.parquet.ParquetImportJobConfigurator;
import parquet.avro.AvroParquetOutputFormat;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.avro.AvroParquetOutputFormat;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import java.io.IOException;


@@ -30,7 +30,7 @@
import org.apache.hadoop.mapreduce.Job;
import org.apache.sqoop.mapreduce.MergeParquetMapper;
import org.apache.sqoop.mapreduce.parquet.ParquetMergeJobConfigurator;
import parquet.avro.AvroParquetInputFormat;
import org.apache.parquet.avro.AvroParquetInputFormat;
import java.io.IOException;


@@ -29,7 +29,7 @@
import org.junit.Test;
import org.junit.rules.ExpectedException;
import parquet.avro.AvroParquetWriter;
import org.apache.parquet.avro.AvroParquetWriter;
import java.io.IOException;
import java.nio.ByteBuffer;
@@ -44,9 +44,9 @@
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
import static parquet.hadoop.ParquetWriter.DEFAULT_PAGE_SIZE;
import static parquet.hadoop.metadata.CompressionCodecName.SNAPPY;
import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_PAGE_SIZE;
import static org.apache.parquet.hadoop.metadata.CompressionCodecName.SNAPPY;
/**


@@ -18,6 +18,7 @@
package org.apache.sqoop;
import org.apache.avro.util.Utf8;
import org.apache.sqoop.testutil.CommonArgs;
import org.apache.sqoop.testutil.HsqldbTestServer;
import org.apache.sqoop.testutil.ImportJobTestCase;
@@ -165,7 +166,7 @@ private void runParquetImportTest(String codec, String expectedCodec) throws IOException {
assertEquals("DATA_COL2", 200L, record1.get("DATA_COL2"));
assertEquals("DATA_COL3", 1.0f, record1.get("DATA_COL3"));
assertEquals("DATA_COL4", 2.0, record1.get("DATA_COL4"));
assertEquals("DATA_COL5", "s", record1.get("DATA_COL5"));
assertEquals("DATA_COL5", new Utf8("s"), record1.get("DATA_COL5"));
Object object = record1.get("DATA_COL6");
assertTrue(object instanceof ByteBuffer);
ByteBuffer b = ((ByteBuffer) object);
@@ -191,7 +192,7 @@ public void testOverrideTypeMapping() throws IOException {
List<GenericRecord> genericRecords = new ParquetReader(getTablePath()).readAll();
GenericRecord record1 = genericRecords.get(0);
assertEquals("DATA_COL0", "10", record1.get("DATA_COL0"));
assertEquals("DATA_COL0", new Utf8("10"), record1.get("DATA_COL0"));
assertEquals(1, genericRecords.size());
}
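
A note on the two assertion changes in this file: Avro generic records return string fields as org.apache.avro.util.Utf8, and Utf8.equals() is false for any java.lang.String, so the expected values are now wrapped in new Utf8(...). A minimal standalone sketch of the distinction:

import org.apache.avro.util.Utf8;

public class Utf8EqualitySketch {
    public static void main(String[] args) {
        Utf8 value = new Utf8("s");
        System.out.println(value.equals("s"));            // false: Utf8 never equals a String
        System.out.println(value.equals(new Utf8("s")));  // true: same class, same bytes
        System.out.println(value.toString().equals("s")); // true after explicit conversion
    }
}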


@@ -24,14 +24,14 @@
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import java.util.Arrays;
import java.util.List;
import static java.util.Arrays.asList;
import static org.junit.Assert.assertEquals;
import static parquet.hadoop.metadata.CompressionCodecName.GZIP;
import static org.apache.parquet.hadoop.metadata.CompressionCodecName.GZIP;
public class TestParquetIncrementalImportMerge extends ImportJobTestCase {


@@ -37,7 +37,7 @@
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;
import parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import java.io.IOException;
import java.util.Arrays;


@@ -81,6 +81,8 @@ private void createHiveConf() {
config.set(HiveConf.ConfVars.HIVE_SERVER2_THRIFT_BIND_HOST.varname, getHostName());
config.setInt(HiveConf.ConfVars.HIVE_SERVER2_THRIFT_PORT.varname, getPort());
config.set(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname, getMetastoreConnectUrl());
// Setting the port to -1 turns the web UI off
config.setInt(HiveConf.ConfVars.HIVE_SERVER2_WEBUI_PORT.varname, -1);
for (Map.Entry<String, String> authConfig : authenticationConfiguration.getAuthenticationConfig().entrySet()) {
config.set(authConfig.getKey(), authConfig.getValue());


@@ -23,13 +23,13 @@
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import parquet.avro.AvroParquetReader;
import parquet.hadoop.Footer;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ColumnChunkMetaData;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.hadoop.util.HiddenFileFilter;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.util.HiddenFileFilter;
import java.io.IOException;
import java.util.ArrayDeque;
@@ -52,7 +52,7 @@ public class ParquetReader implements AutoCloseable {
private final Deque<Path> filesToRead;
private parquet.hadoop.ParquetReader<GenericRecord> reader;
private org.apache.parquet.hadoop.ParquetReader<GenericRecord> reader;
public ParquetReader(Path pathToRead, Configuration configuration) {
this.pathToRead = pathToRead;


@@ -40,4 +40,12 @@
<name>hive.querylog.location</name>
<value>${test.build.data}/sqoop/logs</value>
</property>
<property>
<name>datanucleus.schema.autoCreateAll</name>
<value>true</value>
</property>
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
</configuration>