Mirror of https://github.com/apache/sqoop.git

Commit 55d1db2ba3 (parent 2f4da466ef)

SQOOP-2788: Sqoop2: Parquet support for HdfsConnector
(Abraham Fine via Jarek Jarcec Cecho)
@@ -52,6 +52,8 @@ system.classes.default=java.,\
 org.apache.log4j.,\
 org.apache.sqoop.,\
 -org.apache.sqoop.connector.,\
+org.apache.avro.,\
+org.codehaus.jackson.,\
 org.xerial.snappy.,\
 sqoop.properties,\
 sqoop_bootstrap.properties
@@ -73,6 +73,16 @@ limitations under the License.
       <scope>provided</scope>
     </dependency>
+
+    <dependency>
+      <groupId>org.apache.parquet</groupId>
+      <artifactId>parquet-hadoop</artifactId>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.parquet</groupId>
+      <artifactId>parquet-avro</artifactId>
+    </dependency>
   </dependencies>

   <build>
@@ -19,10 +19,14 @@

 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
 import java.security.PrivilegedExceptionAction;
+import java.util.Arrays;

+import org.apache.avro.generic.GenericRecord;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.Seekable;
@@ -33,13 +37,18 @@
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.apache.hadoop.mapreduce.lib.input.FileSplit;
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.RecordReader;
 import org.apache.hadoop.util.LineReader;
 import org.apache.log4j.Logger;
+import org.apache.parquet.avro.AvroReadSupport;
+import org.apache.parquet.hadoop.ParquetInputFormat;
 import org.apache.sqoop.common.SqoopException;
 import org.apache.sqoop.connector.common.SqoopIDFUtils;
 import org.apache.sqoop.connector.hadoop.security.SecurityUtils;
 import org.apache.sqoop.connector.hdfs.configuration.FromJobConfiguration;
 import org.apache.sqoop.connector.hdfs.configuration.LinkConfiguration;
+import org.apache.sqoop.connector.idf.AVROIntermediateDataFormat;
 import org.apache.sqoop.error.code.HdfsConnectorError;
 import org.apache.sqoop.etl.io.DataWriter;
 import org.apache.sqoop.job.etl.Extractor;
@@ -55,6 +64,10 @@ public class HdfsExtractor extends Extractor<LinkConfiguration, FromJobConfigura

   public static final Logger LOG = Logger.getLogger(HdfsExtractor.class);

+  // the sequence of bytes that appears at the beginning and end of every
+  // parquet file
+  private static final byte[] PARQUET_MAGIC = "PAR1".getBytes(Charset.forName("ASCII"));
+
   private Configuration conf = new Configuration();
   private DataWriter dataWriter;
   private Schema schema;
@@ -85,7 +98,7 @@ public Void run() throws Exception {
   private void extractFile(LinkConfiguration linkConfiguration,
                            FromJobConfiguration fromJobConfiguration,
                            Path file, long start, long length, String[] locations)
-      throws IOException {
+      throws IOException, InterruptedException {
     long end = start + length;
     LOG.info("Extracting file " + file);
     LOG.info("\t from offset " + start);
@@ -93,8 +106,10 @@ private void extractFile(LinkConfiguration linkConfiguration,
     LOG.info("\t of length " + length);
     if(isSequenceFile(file)) {
       extractSequenceFile(linkConfiguration, fromJobConfiguration, file, start, length, locations);
-    } else {
-      extractTextFile(linkConfiguration, fromJobConfiguration, file, start, length, locations);
+    } else if(isParquetFile(file)) {
+      extractParquetFile(linkConfiguration, fromJobConfiguration, file, start, length, locations);
+    } else {
+      extractTextFile(linkConfiguration, fromJobConfiguration, file, start, length);
     }
   }

@@ -136,7 +151,7 @@ private void extractSequenceFile(LinkConfiguration linkConfiguration,
   @SuppressWarnings("resource")
   private void extractTextFile(LinkConfiguration linkConfiguration,
                                FromJobConfiguration fromJobConfiguration,
-                               Path file, long start, long length, String[] locations)
+                               Path file, long start, long length)
       throws IOException {
     LOG.info("Extracting text file");
     long end = start + length;
@@ -185,6 +200,35 @@ private void extractTextFile(LinkConfiguration linkConfiguration,
     filestream.close();
   }

+  private void extractParquetFile(LinkConfiguration linkConfiguration,
+                                  FromJobConfiguration fromJobConfiguration,
+                                  Path file, long start, long length,
+                                  String[] locations) throws IOException, InterruptedException {
+    // Parquet does not expose a way to directly deal with file splits
+    // except through the ParquetInputFormat (ParquetInputSplit is @private)
+    FileSplit fileSplit = new FileSplit(file, start, length, locations);
+    conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, AvroReadSupport.class.getName());
+    ParquetInputFormat parquetInputFormat = new ParquetInputFormat();
+
+    // ParquetReader needs a TaskAttemptContext to pass through the
+    // configuration object.
+    TaskAttemptContext taskAttemptContext = new SqoopTaskAttemptContext(conf);
+
+    RecordReader<Void, GenericRecord> recordReader = parquetInputFormat.createRecordReader(fileSplit, taskAttemptContext);
+    recordReader.initialize(fileSplit, taskAttemptContext);
+
+    AVROIntermediateDataFormat idf = new AVROIntermediateDataFormat(schema);
+    while (recordReader.nextKeyValue() != false) {
+      GenericRecord record = recordReader.getCurrentValue();
+      rowsRead++;
+      if (schema instanceof ByteArraySchema) {
+        dataWriter.writeArrayRecord(new Object[]{idf.toObject(record)});
+      } else {
+        dataWriter.writeArrayRecord(idf.toObject(record));
+      }
+    }
+  }
+
   @Override
   public long getRowsRead() {
     return rowsRead;
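Note on the extractor above: it goes through ParquetInputFormat with AvroReadSupport so it can honor the HDFS partition's start/length split, which is why it needs a TaskAttemptContext. When split handling is not needed, the same rows can be read with parquet-avro's AvroParquetReader, which is the approach the tests added later in this commit use to verify output. A minimal, self-contained sketch; the input path is hypothetical:

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;

public class ReadParquetAsAvroSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical file; any file produced by HdfsParquetWriter would do.
    Path file = new Path("/tmp/sqoop/warehouse/part-0001.parquet");

    // Each call to read() returns one row as an Avro GenericRecord, or null at end of file.
    try (ParquetReader<GenericRecord> reader = AvroParquetReader.<GenericRecord>builder(file).build()) {
      GenericRecord record;
      while ((record = reader.read()) != null) {
        System.out.println(record);
      }
    }
  }
}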
@@ -207,6 +251,41 @@ private boolean isSequenceFile(Path file) {
     return true;
   }

+  private boolean isParquetFile(Path file) {
+    try {
+      FileSystem fileSystem = file.getFileSystem(conf);
+      FileStatus fileStatus = fileSystem.getFileStatus(file);
+      FSDataInputStream fsDataInputStream = fileSystem.open(file);
+
+      long fileLength = fileStatus.getLen();
+
+      byte[] fileStart = new byte[PARQUET_MAGIC.length];
+      fsDataInputStream.readFully(fileStart);
+
+      if (LOG.isDebugEnabled()) {
+        LOG.error("file start: " + new String(fileStart, Charset.forName("ASCII")));
+      }
+
+      if (!Arrays.equals(fileStart, PARQUET_MAGIC)) {
+        return false;
+      }
+
+      long fileEndIndex = fileLength - PARQUET_MAGIC.length;
+      fsDataInputStream.seek(fileEndIndex);
+
+      byte[] fileEnd = new byte[PARQUET_MAGIC.length];
+      fsDataInputStream.readFully(fileEnd);
+
+      if (LOG.isDebugEnabled()) {
+        LOG.error("file end: " + new String(fileEnd, Charset.forName("ASCII")));
+      }
+
+      return Arrays.equals(fileEnd, PARQUET_MAGIC);
+    } catch (IOException e) {
+      return false;
+    }
+  }
+
   private void extractRow(LinkConfiguration linkConfiguration, FromJobConfiguration fromJobConfiguration, Text line) throws UnsupportedEncodingException {
     if (schema instanceof ByteArraySchema) {
       dataWriter.writeArrayRecord(new Object[] {line.toString().getBytes(SqoopIDFUtils.BYTE_FIELD_CHARSET)});
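The isParquetFile() check leans on the Parquet file layout: a file starts with the 4-byte magic "PAR1" and ends with the same magic, with the footer metadata and its length written just before the trailing magic. Outside of HDFS the same detection can be expressed with plain java.io; a rough sketch, independent of the connector code and with a hypothetical local path:

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class ParquetMagicCheckSketch {
  private static final byte[] PARQUET_MAGIC = "PAR1".getBytes(StandardCharsets.US_ASCII);

  // Returns true when both the first and last four bytes are the Parquet magic.
  public static boolean looksLikeParquet(String path) {
    try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
      if (raf.length() < 2L * PARQUET_MAGIC.length) {
        return false;
      }
      byte[] head = new byte[PARQUET_MAGIC.length];
      raf.readFully(head);

      byte[] tail = new byte[PARQUET_MAGIC.length];
      raf.seek(raf.length() - PARQUET_MAGIC.length);
      raf.readFully(tail);

      return Arrays.equals(head, PARQUET_MAGIC) && Arrays.equals(tail, PARQUET_MAGIC);
    } catch (IOException e) {
      return false;
    }
  }
}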
@@ -32,6 +32,7 @@
 import org.apache.sqoop.connector.hdfs.configuration.ToFormat;
 import org.apache.sqoop.connector.hdfs.configuration.ToJobConfiguration;
 import org.apache.sqoop.connector.hdfs.hdfsWriter.GenericHdfsWriter;
+import org.apache.sqoop.connector.hdfs.hdfsWriter.HdfsParquetWriter;
 import org.apache.sqoop.connector.hdfs.hdfsWriter.HdfsSequenceWriter;
 import org.apache.sqoop.connector.hdfs.hdfsWriter.HdfsTextWriter;
 import org.apache.sqoop.error.code.HdfsConnectorError;
@@ -89,7 +90,7 @@ public Void run() throws Exception {

         GenericHdfsWriter filewriter = getWriter(toJobConfig);

-        filewriter.initialize(filepath, conf, codec);
+        filewriter.initialize(filepath, context.getSchema(), conf, codec);

         if (!HdfsUtils.hasCustomFormat(linkConfiguration, toJobConfig) || (context.getSchema() instanceof ByteArraySchema)) {
           String record;
@@ -119,8 +120,14 @@ public Void run() throws Exception {
   }

   private GenericHdfsWriter getWriter(ToJobConfiguration toJobConf) {
-    return (toJobConf.toJobConfig.outputFormat == ToFormat.SEQUENCE_FILE) ? new HdfsSequenceWriter()
-        : new HdfsTextWriter();
+    switch(toJobConf.toJobConfig.outputFormat) {
+      case SEQUENCE_FILE:
+        return new HdfsSequenceWriter();
+      case PARQUET_FILE:
+        return new HdfsParquetWriter();
+      default:
+        return new HdfsTextWriter();
+    }
   }

   private String getCompressionCodecName(ToJobConfiguration toJobConf) {
@@ -151,11 +158,16 @@ private String getCompressionCodecName(ToJobConfiguration toJobConf) {

   //TODO: We should probably support configurable extensions at some point
   private static String getExtension(ToJobConfiguration toJobConf, CompressionCodec codec) {
-    if (toJobConf.toJobConfig.outputFormat == ToFormat.SEQUENCE_FILE)
-      return ".seq";
-    if (codec == null)
-      return ".txt";
-    return codec.getDefaultExtension();
+    switch(toJobConf.toJobConfig.outputFormat) {
+      case SEQUENCE_FILE:
+        return ".seq";
+      case PARQUET_FILE:
+        return ".parquet";
+      default:
+        if (codec == null)
+          return ".txt";
+        return codec.getDefaultExtension();
+    }
   }

   /* (non-Javadoc)
@@ -30,4 +30,9 @@ public enum ToFormat {
    * Sequence file
    */
   SEQUENCE_FILE,
+
+  /**
+   * Parquet file
+   */
+  PARQUET_FILE,
 }
@@ -20,12 +20,13 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.sqoop.schema.Schema;

 import java.io.IOException;

 public abstract class GenericHdfsWriter {

-  public abstract void initialize(Path filepath, Configuration conf, CompressionCodec codec) throws IOException;
+  public abstract void initialize(Path filepath, Schema schema, Configuration conf, CompressionCodec codec) throws IOException;

   public abstract void write(String csv) throws IOException;

@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.sqoop.connector.hdfs.hdfsWriter;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.parquet.avro.AvroParquetWriter;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.apache.sqoop.connector.idf.AVROIntermediateDataFormat;
+import org.apache.sqoop.schema.Schema;
+
+import java.io.IOException;
+
+public class HdfsParquetWriter extends GenericHdfsWriter {
+
+  private ParquetWriter avroParquetWriter;
+  private Schema sqoopSchema;
+  private AVROIntermediateDataFormat avroIntermediateDataFormat;
+
+  @Override
+  public void initialize(Path filepath, Schema schema, Configuration conf, CompressionCodec hadoopCodec) throws IOException {
+    sqoopSchema = schema;
+    avroIntermediateDataFormat = new AVROIntermediateDataFormat(sqoopSchema);
+
+    CompressionCodecName parquetCodecName;
+    if (hadoopCodec == null) {
+      parquetCodecName = CompressionCodecName.UNCOMPRESSED;
+    } else {
+      parquetCodecName = CompressionCodecName.fromCompressionCodec(hadoopCodec.getClass());
+    }
+
+    avroParquetWriter =
+      AvroParquetWriter.builder(filepath)
+        .withSchema(avroIntermediateDataFormat.getAvroSchema())
+        .withCompressionCodec(parquetCodecName)
+        .withConf(conf).build();
+
+  }
+
+  @Override
+  public void write(String csv) throws IOException {
+    avroParquetWriter.write(avroIntermediateDataFormat.toAVRO(csv));
+  }
+
+  @Override
+  public void destroy() throws IOException {
+    avroParquetWriter.close();
+  }
+}
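For context, the integration tests added further down in this commit drive HdfsParquetWriter directly: build a Sqoop Schema, call initialize() with a target Path, the schema, a Configuration, and an optional Hadoop codec (null maps to CompressionCodecName.UNCOMPRESSED inside the writer), write CSV-encoded rows, then destroy() to close the underlying ParquetWriter. A condensed sketch of that usage; the output path and the two-column schema are illustrative only:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.sqoop.connector.hdfs.hdfsWriter.HdfsParquetWriter;
import org.apache.sqoop.schema.Schema;
import org.apache.sqoop.schema.type.FixedPoint;
import org.apache.sqoop.schema.type.Text;

public class ParquetWriterSketch {
  public static void main(String[] args) throws Exception {
    // Illustrative two-column schema.
    Schema schema = new Schema("cities");
    schema.addColumn(new FixedPoint("id", Long.valueOf(Integer.SIZE), true));
    schema.addColumn(new Text("city"));

    HdfsParquetWriter writer = new HdfsParquetWriter();
    Configuration conf = new Configuration();

    // Passing a null codec writes an uncompressed Parquet file.
    writer.initialize(new Path("/tmp/sqoop/cities-0001.parquet"), schema, conf, null);
    writer.write("1,'San Francisco'");
    writer.write("2,'Brno'");
    writer.destroy();
  }
}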
@@ -23,16 +23,17 @@
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.sqoop.schema.Schema;

 import java.io.IOException;

 public class HdfsSequenceWriter extends GenericHdfsWriter {

   private SequenceFile.Writer filewriter;
   private Text text;

   @SuppressWarnings("deprecation")
-  public void initialize(Path filepath, Configuration conf, CompressionCodec codec) throws IOException {
+  public void initialize(Path filepath, Schema schema, Configuration conf, CompressionCodec codec) throws IOException {
     if (codec != null) {
       filewriter = SequenceFile.createWriter(filepath.getFileSystem(conf),
           conf, filepath, Text.class, NullWritable.class,
@@ -23,6 +23,7 @@
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.sqoop.connector.hdfs.HdfsConstants;
+import org.apache.sqoop.schema.Schema;

 import java.io.BufferedWriter;
 import java.io.DataOutputStream;
@@ -34,7 +35,7 @@ public class HdfsTextWriter extends GenericHdfsWriter {
   private BufferedWriter filewriter;

   @Override
-  public void initialize(Path filepath, Configuration conf, CompressionCodec codec) throws IOException {
+  public void initialize(Path filepath, Schema schema, Configuration conf, CompressionCodec codec) throws IOException {
     FileSystem fs = filepath.getFileSystem(conf);

     DataOutputStream filestream = fs.create(filepath, false);
@@ -17,9 +17,6 @@
  */
 package org.apache.sqoop.connector.hdfs;

-import static org.apache.sqoop.connector.hdfs.configuration.ToFormat.SEQUENCE_FILE;
-import static org.apache.sqoop.connector.hdfs.configuration.ToFormat.TEXT_FILE;
-
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
@@ -27,6 +24,7 @@
 import java.util.HashMap;
 import java.util.List;

+import org.apache.avro.generic.GenericRecord;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileStatus;
@@ -35,11 +33,17 @@
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.parquet.avro.AvroParquetReader;
+import org.apache.parquet.format.converter.ParquetMetadataConverter;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.ParquetReader;
 import org.apache.sqoop.common.MutableMapContext;
+import org.apache.sqoop.connector.common.SqoopIDFUtils;
 import org.apache.sqoop.connector.hdfs.configuration.LinkConfiguration;
 import org.apache.sqoop.connector.hdfs.configuration.ToCompression;
 import org.apache.sqoop.connector.hdfs.configuration.ToFormat;
 import org.apache.sqoop.connector.hdfs.configuration.ToJobConfiguration;
+import org.apache.sqoop.connector.idf.AVROIntermediateDataFormat;
 import org.apache.sqoop.etl.io.DataReader;
 import org.apache.sqoop.job.etl.Loader;
 import org.apache.sqoop.job.etl.LoaderContext;
@@ -47,13 +51,18 @@
 import org.apache.sqoop.schema.type.FixedPoint;
 import org.apache.sqoop.schema.type.FloatingPoint;
 import org.apache.sqoop.schema.type.Text;
-import org.testng.annotations.AfterMethod;
+import org.apache.sqoop.utils.ClassUtils;
 import org.testng.Assert;
+import org.testng.annotations.AfterMethod;
 import org.testng.annotations.BeforeMethod;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Factory;
 import org.testng.annotations.Test;

+import static org.apache.sqoop.connector.hdfs.configuration.ToFormat.PARQUET_FILE;
+import static org.apache.sqoop.connector.hdfs.configuration.ToFormat.SEQUENCE_FILE;
+import static org.apache.sqoop.connector.hdfs.configuration.ToFormat.TEXT_FILE;
+
 public class TestLoader extends TestHdfsBase {
   private static final String INPUT_ROOT = System.getProperty("maven.build.directory", "/tmp") + "/sqoop/warehouse/";
   private static final int NUMBER_OF_ROWS_PER_FILE = 1000;
@@ -63,6 +72,7 @@ public class TestLoader extends TestHdfsBase {
   private final String outputDirectory;
   private Loader loader;
   private String user = "test_user";
+  private Schema schema;

   @Factory(dataProvider="test-hdfs-loader")
   public TestLoader(ToFormat outputFormat,
@@ -80,9 +90,10 @@ public static Object[][] data() {
     for (ToCompression compression : new ToCompression[]{
         ToCompression.DEFAULT,
         ToCompression.BZIP2,
+        ToCompression.GZIP,
         ToCompression.NONE
     }) {
-      for (Object outputFileType : new Object[]{TEXT_FILE, SEQUENCE_FILE}) {
+      for (Object outputFileType : new Object[]{TEXT_FILE, SEQUENCE_FILE, PARQUET_FILE}) {
        parameters.add(new Object[]{outputFileType, compression});
      }
    }
@@ -100,7 +111,7 @@ public void tearDown() throws IOException {
   @Test
   public void testLoader() throws Exception {
     FileSystem fs = FileSystem.get(new Configuration());
-    Schema schema = new Schema("schema").addColumn(new FixedPoint("col1", 8L, true))
+    schema = new Schema("schema").addColumn(new FixedPoint("col1", 8L, true))
         .addColumn(new FloatingPoint("col2", 4L))
         .addColumn(new Text("col3"));

@@ -130,14 +141,22 @@ public Object readContent() {
         assertTestUser(user);
         return null;
       }
-    }, null, user);
+    }, schema, user);
     LinkConfiguration linkConf = new LinkConfiguration();
     ToJobConfiguration jobConf = new ToJobConfiguration();
     jobConf.toJobConfig.compression = compression;
     jobConf.toJobConfig.outputFormat = outputFormat;
     Path outputPath = new Path(outputDirectory);

-    loader.load(context, linkConf, jobConf);
+    try {
+      loader.load(context, linkConf, jobConf);
+    } catch (Exception e) {
+      // we may wait to fail if the compression format selected is not supported by the
+      // output format
+      Assert.assertTrue(compressionNotSupported());
+      return;
+    }
+
     Assert.assertEquals(1, fs.listStatus(outputPath).length);

     for (FileStatus status : fs.listStatus(outputPath)) {
@@ -152,10 +171,26 @@ public Object readContent() {
     Assert.assertEquals(5, fs.listStatus(outputPath).length);
   }

+  private boolean compressionNotSupported() {
+    switch (outputFormat) {
+      case SEQUENCE_FILE:
+        return compression == ToCompression.GZIP;
+      case PARQUET_FILE:
+        return compression == ToCompression.BZIP2 || compression == ToCompression.DEFAULT;
+    }
+    return false;
+  }
+
   @Test
   public void testOverrideNull() throws Exception {
+    // Parquet supports an actual "null" value so overriding null would not make
+    // sense here
+    if (outputFormat == PARQUET_FILE) {
+      return;
+    }
+
     FileSystem fs = FileSystem.get(new Configuration());
-    Schema schema = new Schema("schema").addColumn(new FixedPoint("col1", 8L, true))
+    schema = new Schema("schema").addColumn(new FixedPoint("col1", 8L, true))
         .addColumn(new FloatingPoint("col2", 8L))
         .addColumn(new Text("col3"))
         .addColumn(new Text("col4"));
@@ -199,7 +234,15 @@ public Object readContent() {
     jobConf.toJobConfig.nullValue = "\\N";
     Path outputPath = new Path(outputDirectory);

-    loader.load(context, linkConf, jobConf);
+    try {
+      loader.load(context, linkConf, jobConf);
+    } catch (Exception e) {
+      // we may wait to fail if the compression format selected is not supported by the
+      // output format
+      assert(compressionNotSupported());
+      return;
+    }
+
     Assert.assertEquals(1, fs.listStatus(outputPath).length);

     for (FileStatus status : fs.listStatus(outputPath)) {
@@ -214,7 +257,7 @@ public Object readContent() {
     Assert.assertEquals(5, fs.listStatus(outputPath).length);
   }

-  private void verifyOutput(FileSystem fs, Path file, String format) throws IOException {
+  private void verifyOutput(FileSystem fs, Path file, String format) throws Exception {
     Configuration conf = new Configuration();
     FSDataInputStream fsin = fs.open(file);
     CompressionCodec codec;
@@ -228,7 +271,9 @@ private void verifyOutput(FileSystem fs, Path file, String format) throws IOExce
       case BZIP2:
         Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("BZip2") != -1);
         break;
+      case GZIP:
+        Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Gzip") != -1);
+        break;
       case DEFAULT:
         if(org.apache.hadoop.util.VersionInfo.getVersion().matches("\\b1\\.\\d\\.\\d")) {
           Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Default") != -1);
@@ -282,11 +327,47 @@ private void verifyOutput(FileSystem fs, Path file, String format) throws IOExce
           Assert.assertEquals(line.toString(), formatRow(format, index++));
           line = new org.apache.hadoop.io.Text();
         }
+        break;
+      case PARQUET_FILE:
+        String compressionCodecClassName = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER).getBlocks().get(0).getColumns().get(0).getCodec().getHadoopCompressionCodecClassName();
+
+        if (compressionCodecClassName == null) {
+          codec = null;
+        } else {
+          codec = (CompressionCodec) ClassUtils.loadClass(compressionCodecClassName).newInstance();
+        }
+
+        // Verify compression
+        switch(compression) {
+          case GZIP:
+            Assert.assertTrue(codec.getClass().getCanonicalName().indexOf("Gzip") != -1);
+            break;
+
+          case NONE:
+          default:
+            Assert.assertNull(codec);
+            break;
+        }
+
+
+        ParquetReader<GenericRecord> avroParquetReader = AvroParquetReader.builder(file).build();
+        AVROIntermediateDataFormat avroIntermediateDataFormat = new AVROIntermediateDataFormat();
+        avroIntermediateDataFormat.setSchema(schema);
+        GenericRecord record;
+        index = 1;
+        while ((record = avroParquetReader.read()) != null) {
+          List<Object> objects = new ArrayList<>();
+          for (int i = 0; i < record.getSchema().getFields().size(); i++) {
+            objects.add(record.get(i));
+          }
+          Assert.assertEquals(SqoopIDFUtils.toText(avroIntermediateDataFormat.toCSV(record)), formatRow(format, index++));
+        }
+
         break;
     }
   }

-  private void verifyOutput(FileSystem fs, Path file) throws IOException {
+  private void verifyOutput(FileSystem fs, Path file) throws Exception {
     verifyOutput(fs, file, "%d,%f,%s");
   }
 }
@@ -43,7 +43,8 @@ public class SqoopAvroUtils {
    * Creates an Avro schema from a Sqoop schema.
    */
   public static Schema createAvroSchema(org.apache.sqoop.schema.Schema sqoopSchema) {
-    String name = sqoopSchema.getName();
+    // avro schema names cannot start with quotes, lets just remove them
+    String name = sqoopSchema.getName().replace("\"", "");
     String doc = sqoopSchema.getNote();
     String namespace = SQOOP_SCHEMA_NAMESPACE;
     Schema schema = Schema.createRecord(name, doc, namespace, false);
@@ -148,7 +148,7 @@ public Set<String> getJars() {
     return jars;
   }

-  private GenericRecord toAVRO(String csv) {
+  public GenericRecord toAVRO(String csv) {

     String[] csvStringArray = parseCSVString(csv);

@@ -175,7 +175,7 @@ private GenericRecord toAVRO(String csv) {
     return avroObject;
   }

-  private Object toAVRO(String csvString, Column column) {
+  public Object toAVRO(String csvString, Column column) {
     Object returnValue = null;

     switch (column.getType()) {
@@ -232,7 +232,7 @@ private Object toAVRO(String csvString, Column column) {
     return returnValue;
   }

-  private GenericRecord toAVRO(Object[] objectArray) {
+  public GenericRecord toAVRO(Object[] objectArray) {

     if (objectArray == null) {
       return null;
@@ -311,7 +311,7 @@ private GenericRecord toAVRO(Object[] objectArray) {
   }

   @SuppressWarnings("unchecked")
-  private String toCSV(GenericRecord record) {
+  public String toCSV(GenericRecord record) {
     Column[] columns = this.schema.getColumnsArray();

     StringBuilder csvString = new StringBuilder();
@@ -387,7 +387,7 @@ private String toCSV(GenericRecord record) {
   }

   @SuppressWarnings("unchecked")
-  private Object[] toObject(GenericRecord record) {
+  public Object[] toObject(GenericRecord record) {

     if (record == null) {
       return null;
@@ -459,4 +459,8 @@ private Object[] toObject(GenericRecord record) {
     }
     return object;
   }
+
+  public Schema getAvroSchema() {
+    return avroSchema;
+  }
 }
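The visibility changes above (toAVRO, toCSV and toObject moving from private to public, plus the new getAvroSchema() accessor) are what let the HDFS connector convert records without going through the intermediate data format's full read/write API: HdfsParquetWriter turns the loader's CSV text into an Avro GenericRecord with toAVRO(csv), while the extractor and the tests turn GenericRecords read from Parquet back into object arrays or CSV with toObject/toCSV. A minimal round-trip sketch, assuming an illustrative two-column schema:

import org.apache.avro.generic.GenericRecord;
import org.apache.sqoop.connector.idf.AVROIntermediateDataFormat;
import org.apache.sqoop.schema.Schema;
import org.apache.sqoop.schema.type.FixedPoint;
import org.apache.sqoop.schema.type.Text;

public class IdfRoundTripSketch {
  public static void main(String[] args) {
    // Illustrative schema; real jobs get the schema from the loader/extractor context.
    Schema schema = new Schema("cities");
    schema.addColumn(new FixedPoint("id", Long.valueOf(Integer.SIZE), true));
    schema.addColumn(new Text("city"));

    AVROIntermediateDataFormat idf = new AVROIntermediateDataFormat(schema);

    // CSV text -> Avro record (what HdfsParquetWriter.write() does)
    GenericRecord record = idf.toAVRO("1,'Brno'");

    // Avro record -> CSV text / object array (what the extractor and tests do)
    String csv = idf.toCSV(record);
    Object[] fields = idf.toObject(record);

    System.out.println(csv + " -> " + fields.length + " fields");
  }
}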
pom.xml

@@ -124,6 +124,7 @@ limitations under the License.
     <groovy.version>2.4.0</groovy.version>
     <jansi.version>1.7</jansi.version>
     <felix.version>2.4.0</felix.version>
+    <parquet.version>1.8.1</parquet.version>
     <!-- maven plugin versions -->
     <maven-assembly-plugin.version>2.6</maven-assembly-plugin.version>
   </properties>
@@ -700,6 +701,16 @@ limitations under the License.
         <artifactId>jetty-servlet</artifactId>
         <version>${jetty.version}</version>
       </dependency>
+      <dependency>
+        <groupId>org.apache.parquet</groupId>
+        <artifactId>parquet-hadoop</artifactId>
+        <version>${parquet.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.parquet</groupId>
+        <artifactId>parquet-avro</artifactId>
+        <version>${parquet.version}</version>
+      </dependency>
     </dependencies>
   </dependencyManagement>

test/pom.xml

@@ -175,6 +175,16 @@ limitations under the License.
       <artifactId>hadoop-common</artifactId>
     </dependency>

+    <dependency>
+      <groupId>org.apache.parquet</groupId>
+      <artifactId>parquet-hadoop</artifactId>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.parquet</groupId>
+      <artifactId>parquet-avro</artifactId>
+    </dependency>
+
   </dependencies>

   <!-- Add classifier name to the JAR name -->
@@ -20,17 +20,27 @@
 import com.google.common.collect.HashMultiset;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Multiset;
+import org.apache.avro.generic.GenericRecord;
 import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.log4j.Logger;
+import org.apache.parquet.avro.AvroParquetReader;
+import org.apache.parquet.hadoop.ParquetReader;
 import org.apache.sqoop.connector.common.SqoopIDFUtils;
 import org.apache.sqoop.connector.hdfs.configuration.ToFormat;
+import org.apache.sqoop.connector.hdfs.hdfsWriter.HdfsParquetWriter;
+import org.apache.sqoop.connector.idf.AVROIntermediateDataFormat;
 import org.apache.sqoop.model.MDriverConfig;
 import org.apache.sqoop.model.MJob;
 import org.apache.sqoop.model.MLink;
+import org.apache.sqoop.schema.Schema;
+import org.apache.sqoop.schema.type.DateTime;
+import org.apache.sqoop.schema.type.FixedPoint;
 import org.apache.sqoop.test.asserts.HdfsAsserts;
 import org.apache.sqoop.test.infrastructure.Infrastructure;
 import org.apache.sqoop.test.infrastructure.SqoopTestCase;
@@ -51,6 +61,7 @@
 import java.sql.Timestamp;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.LinkedList;
 import java.util.List;

 @Infrastructure(dependencies = {KdcInfrastructureProvider.class, HadoopInfrastructureProvider.class, SqoopInfrastructureProvider.class, DatabaseInfrastructureProvider.class})
@@ -64,6 +75,9 @@ public class NullValueTest extends SqoopTestCase {
   // The custom nullValue to use (set to null if default)
   private String nullValue;

+
+  private Schema sqoopSchema;
+
   @DataProvider(name="nul-value-test")
   public static Object[][] data(ITestContext context) {
     String customNullValue = "^&*custom!@";
@@ -80,12 +94,19 @@ public NullValueTest(ToFormat format, String nullValue) {
   }

   @Override
+
   public String getTestName() {
     return methodName + "[" + format.name() + ", " + nullValue + "]";
   }

   @BeforeMethod
   public void setup() throws Exception {
+    sqoopSchema = new Schema("cities");
+    sqoopSchema.addColumn(new FixedPoint("id", Long.valueOf(Integer.SIZE), true));
+    sqoopSchema.addColumn(new org.apache.sqoop.schema.type.Text("country"));
+    sqoopSchema.addColumn(new DateTime("some_date", true, false));
+    sqoopSchema.addColumn(new org.apache.sqoop.schema.type.Text("city"));
+
     createTableCities();
   }

@@ -128,6 +149,27 @@ public void testFromHdfs() throws Exception {
         }
         sequenceFileWriter.close();
         break;
+      case PARQUET_FILE:
+        // Parquet file format does not support using custom null values
+        if (usingCustomNullValue()) {
+          return;
+        } else {
+          HdfsParquetWriter parquetWriter = new HdfsParquetWriter();
+
+          Configuration conf = new Configuration();
+          FileSystem.setDefaultUri(conf, hdfsClient.getUri());
+
+          parquetWriter.initialize(
+              new Path(HdfsUtils.joinPathFragments(getMapreduceDirectory(), "input-0001.parquet")),
+              sqoopSchema, conf, null);
+
+          for (String line : getCsv()) {
+            parquetWriter.write(line);
+          }
+
+          parquetWriter.destroy();
+          break;
+        }
       default:
         Assert.fail();
     }
@@ -166,6 +208,11 @@ public void testFromHdfs() throws Exception {

   @Test
   public void testToHdfs() throws Exception {
+    // Parquet file format does not support using custom null values
+    if (usingCustomNullValue() && format == ToFormat.PARQUET_FILE) {
+      return;
+    }
+
     provider.insertRow(getTableName(), 1, "USA", Timestamp.valueOf("2004-10-23 00:00:00.000"), "San Francisco");
     provider.insertRow(getTableName(), 2, "USA", Timestamp.valueOf("2004-10-24 00:00:00.000"), (String) null);
     provider.insertRow(getTableName(), 3, (String) null, Timestamp.valueOf("2004-10-25 00:00:00.000"), "Brno");
@@ -203,16 +250,16 @@ public void testToHdfs() throws Exception {

     executeJob(job);


+    Multiset<String> setLines = HashMultiset.create(Arrays.asList(getCsv()));
+    Path[] files = HdfsUtils.getOutputMapreduceFiles(hdfsClient, HdfsUtils.joinPathFragments(getMapreduceDirectory(), "TO"));
+    List<String> notFound = new ArrayList<>();
     switch (format) {
       case TEXT_FILE:
         HdfsAsserts.assertMapreduceOutput(hdfsClient,
             HdfsUtils.joinPathFragments(getMapreduceDirectory(), "TO"), getCsv());
-        break;
+        return;
       case SEQUENCE_FILE:
-        Multiset<String> setLines = HashMultiset.create(Arrays.asList(getCsv()));
-        List<String> notFound = new ArrayList<>();
-        Path[] files = HdfsUtils.getOutputMapreduceFiles(hdfsClient, HdfsUtils.joinPathFragments(getMapreduceDirectory(), "TO"));
         for(Path file : files) {
           SequenceFile.Reader.Option optPath = SequenceFile.Reader.file(file);
           SequenceFile.Reader sequenceFileReader = new SequenceFile.Reader(getHadoopConf(), optPath);
@@ -224,17 +271,32 @@ public void testToHdfs() throws Exception {
           }
         }
       }
-      if(!setLines.isEmpty() || !notFound.isEmpty()) {
-        LOG.error("Output do not match expectations.");
-        LOG.error("Expected lines that weren't present in the files:");
-        LOG.error("\t'" + StringUtils.join(setLines, "'\n\t'") + "'");
-        LOG.error("Extra lines in files that weren't expected:");
-        LOG.error("\t'" + StringUtils.join(notFound, "'\n\t'") + "'");
-        Assert.fail("Output do not match expectations.");
-      }
+        break;
+      case PARQUET_FILE:
+        AVROIntermediateDataFormat avroIntermediateDataFormat = new AVROIntermediateDataFormat(sqoopSchema);
+        notFound = new LinkedList<>();
+        for (Path file : files) {
+          ParquetReader<GenericRecord> avroParquetReader = AvroParquetReader.builder(file).build();
+          GenericRecord record;
+          while ((record = avroParquetReader.read()) != null) {
+            String recordAsCsv = avroIntermediateDataFormat.toCSV(record);
+            if (!setLines.remove(recordAsCsv)) {
+              notFound.add(recordAsCsv);
+            }
+          }
+        }
         break;
       default:
         Assert.fail();
     }
+
+    if(!setLines.isEmpty() || !notFound.isEmpty()) {
+      LOG.error("Output do not match expectations.");
+      LOG.error("Expected lines that weren't present in the files:");
+      LOG.error("\t'" + StringUtils.join(setLines, "'\n\t'") + "'");
+      LOG.error("Extra lines in files that weren't expected:");
+      LOG.error("\t'" + StringUtils.join(notFound, "'\n\t'") + "'");
+      Assert.fail("Output do not match expectations.");
+    }
   }
 }
@@ -0,0 +1,183 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.sqoop.integration.connector.hdfs;
+
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.avro.AvroParquetReader;
+import org.apache.parquet.hadoop.ParquetReader;
+import org.apache.sqoop.connector.hdfs.configuration.ToFormat;
+import org.apache.sqoop.connector.hdfs.hdfsWriter.HdfsParquetWriter;
+import org.apache.sqoop.model.MJob;
+import org.apache.sqoop.model.MLink;
+import org.apache.sqoop.schema.Schema;
+import org.apache.sqoop.schema.type.DateTime;
+import org.apache.sqoop.schema.type.FixedPoint;
+import org.apache.sqoop.schema.type.Text;
+import org.apache.sqoop.test.infrastructure.Infrastructure;
+import org.apache.sqoop.test.infrastructure.SqoopTestCase;
+import org.apache.sqoop.test.infrastructure.providers.DatabaseInfrastructureProvider;
+import org.apache.sqoop.test.infrastructure.providers.HadoopInfrastructureProvider;
+import org.apache.sqoop.test.infrastructure.providers.KdcInfrastructureProvider;
+import org.apache.sqoop.test.infrastructure.providers.SqoopInfrastructureProvider;
+import org.apache.sqoop.test.utils.HdfsUtils;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.Test;
+
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.fail;
+
+@Infrastructure(dependencies = {KdcInfrastructureProvider.class, HadoopInfrastructureProvider.class, SqoopInfrastructureProvider.class, DatabaseInfrastructureProvider.class})
+public class ParquetTest extends SqoopTestCase {
+
+  @AfterMethod
+  public void dropTable() {
+    super.dropTable();
+  }
+
+  @Test
+  public void toParquetTest() throws Exception {
+    createAndLoadTableCities();
+
+    // RDBMS link
+    MLink rdbmsConnection = getClient().createLink("generic-jdbc-connector");
+    fillRdbmsLinkConfig(rdbmsConnection);
+    saveLink(rdbmsConnection);
+
+    // HDFS link
+    MLink hdfsConnection = getClient().createLink("hdfs-connector");
+    fillHdfsLink(hdfsConnection);
+    saveLink(hdfsConnection);
+
+    // Job creation
+    MJob job = getClient().createJob(rdbmsConnection.getName(), hdfsConnection.getName());
+
+    // Set rdbms "FROM" config
+    fillRdbmsFromConfig(job, "id");
+
+    // Fill the hdfs "TO" config
+    fillHdfsToConfig(job, ToFormat.PARQUET_FILE);
+
+    saveJob(job);
+    executeJob(job);
+
+    String[] expectedOutput =
+        {"'1','USA','2004-10-23 00:00:00.000','San Francisco'",
+         "'2','USA','2004-10-24 00:00:00.000','Sunnyvale'",
+         "'3','Czech Republic','2004-10-25 00:00:00.000','Brno'",
+         "'4','USA','2004-10-26 00:00:00.000','Palo Alto'"};
+
+    Multiset<String> setLines = HashMultiset.create(Arrays.asList(expectedOutput));
+
+    List<String> notFound = new LinkedList<>();
+
+    Path[] files = HdfsUtils.getOutputMapreduceFiles(hdfsClient, getMapreduceDirectory());
+    for (Path file : files) {
+      ParquetReader<GenericRecord> avroParquetReader = AvroParquetReader.builder(file).build();
+      GenericRecord record;
+      while ((record = avroParquetReader.read()) != null) {
+        String recordAsLine = recordToLine(record);
+        if (!setLines.remove(recordAsLine)) {
+          notFound.add(recordAsLine);
+        }
+      }
+    }
+
+    if (!setLines.isEmpty() || !notFound.isEmpty()) {
+      fail("Output do not match expectations.");
+    }
+  }
+
+  @Test
+  public void fromParquetTest() throws Exception {
+    createTableCities();
+
+    Schema sqoopSchema = new Schema("cities");
+    sqoopSchema.addColumn(new FixedPoint("id", Long.valueOf(Integer.SIZE), true));
+    sqoopSchema.addColumn(new Text("country"));
+    sqoopSchema.addColumn(new DateTime("some_date", true, false));
+    sqoopSchema.addColumn(new Text("city"));
+
+    HdfsParquetWriter parquetWriter = new HdfsParquetWriter();
+
+    Configuration conf = new Configuration();
+    FileSystem.setDefaultUri(conf, hdfsClient.getUri());
+
+    parquetWriter.initialize(
+        new Path(HdfsUtils.joinPathFragments(getMapreduceDirectory(), "input-0001.parquet")),
+        sqoopSchema, conf, null);
+
+    parquetWriter.write("1,'USA','2004-10-23 00:00:00.000','San Francisco'");
+    parquetWriter.write("2,'USA','2004-10-24 00:00:00.000','Sunnyvale'");
+
+    parquetWriter.destroy();
+
+    parquetWriter.initialize(
+        new Path(HdfsUtils.joinPathFragments(getMapreduceDirectory(), "input-0002.parquet")),
+        sqoopSchema, conf, null);
+
+    parquetWriter.write("3,'Czech Republic','2004-10-25 00:00:00.000','Brno'");
+    parquetWriter.write("4,'USA','2004-10-26 00:00:00.000','Palo Alto'");
+
+    parquetWriter.destroy();
+
+    // RDBMS link
+    MLink rdbmsLink = getClient().createLink("generic-jdbc-connector");
+    fillRdbmsLinkConfig(rdbmsLink);
+    saveLink(rdbmsLink);
+
+    // HDFS link
+    MLink hdfsLink = getClient().createLink("hdfs-connector");
+    fillHdfsLink(hdfsLink);
+    saveLink(hdfsLink);
+
+    // Job creation
+    MJob job = getClient().createJob(hdfsLink.getName(), rdbmsLink.getName());
+    fillHdfsFromConfig(job);
+    fillRdbmsToConfig(job);
+    saveJob(job);
+
+    executeJob(job);
+    assertEquals(provider.rowCount(getTableName()), 4);
+    assertRowInCities(1, "USA", Timestamp.valueOf("2004-10-23 00:00:00.000"), "San Francisco");
+    assertRowInCities(2, "USA", Timestamp.valueOf("2004-10-24 00:00:00.000"), "Sunnyvale");
+    assertRowInCities(3, "Czech Republic", Timestamp.valueOf("2004-10-25 00:00:00.000"), "Brno");
+    assertRowInCities(4, "USA", Timestamp.valueOf("2004-10-26 00:00:00.000"), "Palo Alto");
+  }
+
+  public String recordToLine(GenericRecord genericRecord) {
+    String line = "";
+    line += "\'" + String.valueOf(genericRecord.get(0)) + "\',";
+    line += "\'" + String.valueOf(genericRecord.get(1)) + "\',";
+    line += "\'" + new Timestamp((Long)genericRecord.get(2)) + "00\',";
+    line += "\'" + String.valueOf(genericRecord.get(3)) + "\'";
+    return line;
+  }
+
+}