From 1eb422623030ef8e2452950f37987c4b3c7765a7 Mon Sep 17 00:00:00 2001 From: Andrew Bayer Date: Fri, 22 Jul 2011 20:03:46 +0000 Subject: [PATCH] SIP-3. File format for large object (LOB) storage. Introduce LobFile format for storing large objects. Implemented LobFile.Reader, LobFile.Writer classes. Added a performance test of LobFile reading/writing speed. Build system: fix cobertura build deps. Remove unused utility classes from o.a.h.s.io. Use LobFile for external storage in {B,C}lobRef. Added LobReaderCache. Converted BlobRef to read from LobFiles (through LobReaderCache). LargeObjectLoader writes to LobFiles. Common code from BlobRef and ClobRef factored out into LobRef abstract base class. Updated Test{B,C}lobRef and TestLargeObjectLoader for new external LOB storage. Updated *ImportMappers to close LargeObjectLoaders when they're done. Added performance tests to build. Added script to run perf tests; factored out common logic into config script. Fixed ivy dependency resolution to use multiple configuration inheritance. Added LobFileStressTest. Added readme with instructions to src/perftest directory. Added CodecMap that abstracts compression codec classes to names. 
From: Aaron Kimball git-svn-id: https://svn.apache.org/repos/asf/incubator/sqoop/trunk@1149897 13f79535-47bb-0310-9956-ffa450edef68 --- bin/configure-sqoop | 72 + bin/sqoop | 43 +- build.xml | 32 +- ivy.xml | 10 +- .../org/apache/hadoop/sqoop/io/CodecMap.java | 99 + .../hadoop/sqoop/io/CountingOutputStream.java | 74 - .../sqoop/io/FixedLengthInputStream.java | 90 + .../sqoop/io/HdfsSplitOutputStream.java | 154 -- .../org/apache/hadoop/sqoop/io/LobFile.java | 1802 +++++++++++++++++ .../hadoop/sqoop/io/LobReaderCache.java | 146 ++ .../sqoop/io/UnsupportedCodecException.java | 38 + .../org/apache/hadoop/sqoop/lib/BlobRef.java | 160 +- .../org/apache/hadoop/sqoop/lib/ClobRef.java | 158 +- .../hadoop/sqoop/lib/LargeObjectLoader.java | 145 +- .../org/apache/hadoop/sqoop/lib/LobRef.java | 295 +++ .../mapreduce/SequenceFileImportMapper.java | 19 +- .../sqoop/mapreduce/TextImportMapper.java | 18 +- src/perftest/LobFilePerfTest.java | 112 + src/perftest/LobFileStressTest.java | 391 ++++ src/perftest/README-perftest.txt | 40 + src/scripts/run-perftest.sh | 39 + .../org/apache/hadoop/sqoop/SmokeTests.java | 2 + .../apache/hadoop/sqoop/io/TestLobFile.java | 577 ++++++ .../apache/hadoop/sqoop/lib/TestBlobRef.java | 33 +- .../apache/hadoop/sqoop/lib/TestClobRef.java | 44 +- .../sqoop/lib/TestLargeObjectLoader.java | 4 +- 26 files changed, 4012 insertions(+), 585 deletions(-) create mode 100755 bin/configure-sqoop create mode 100644 src/java/org/apache/hadoop/sqoop/io/CodecMap.java delete mode 100644 src/java/org/apache/hadoop/sqoop/io/CountingOutputStream.java create mode 100644 src/java/org/apache/hadoop/sqoop/io/FixedLengthInputStream.java delete mode 100644 src/java/org/apache/hadoop/sqoop/io/HdfsSplitOutputStream.java create mode 100644 src/java/org/apache/hadoop/sqoop/io/LobFile.java create mode 100644 src/java/org/apache/hadoop/sqoop/io/LobReaderCache.java create mode 100644 src/java/org/apache/hadoop/sqoop/io/UnsupportedCodecException.java create mode 100644 
src/java/org/apache/hadoop/sqoop/lib/LobRef.java
 create mode 100644 src/perftest/LobFilePerfTest.java
 create mode 100644 src/perftest/LobFileStressTest.java
 create mode 100644 src/perftest/README-perftest.txt
 create mode 100755 src/scripts/run-perftest.sh
 create mode 100644 src/test/org/apache/hadoop/sqoop/io/TestLobFile.java

diff --git a/bin/configure-sqoop b/bin/configure-sqoop
new file mode 100755
index 00000000..fef48e86
--- /dev/null
+++ b/bin/configure-sqoop
@@ -0,0 +1,72 @@
+#!/bin/sh
+#
+# Licensed to Cloudera, Inc. under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Sourced by bin/sqoop to set and export the SQOOP_* and HADOOP_* environment
+# variables needed before invoking Hadoop.
+
+# $0 is still the path of the script that sourced this file; readlink -f
+# resolves symlinks so that ${bin} is that script's real directory.
+prgm=$(readlink -f "$0")
+bin=$(dirname "${prgm}")
+bin=$(cd "${bin}" && pwd)
+
+if [ -z "$SQOOP_HOME" ]; then
+  export SQOOP_HOME="${bin}/.."
+fi
+
+if [ -z "${HADOOP_HOME}" ]; then
+  # Try CDH default if the user hasn't set this.
+  HADOOP_HOME=/usr/lib/hadoop
+fi
+
+# Where to find the main Sqoop jar
+SQOOP_JAR_DIR=$SQOOP_HOME
+
+# Where to find the shim jars.
+SQOOP_SHIM_DIR=$SQOOP_HOME/shims
+
+# If there's a "build" subdir, override with this, so we use
+# the newly-compiled copy.
+if [ -d "$SQOOP_JAR_DIR/build" ]; then
+  SQOOP_JAR_DIR="${SQOOP_JAR_DIR}/build"
+
+  if [ -d "$SQOOP_JAR_DIR/shims" ]; then
+    SQOOP_SHIM_DIR="$SQOOP_JAR_DIR/shims"
+  fi
+fi
+
+# Add sqoop dependencies to classpath; -e skips an unmatched literal glob.
+SQOOP_CLASSPATH=""
+if [ -d "$SQOOP_HOME/lib" ]; then
+  for f in "$SQOOP_HOME"/lib/*.jar; do
+    if [ -e "$f" ]; then SQOOP_CLASSPATH="${SQOOP_CLASSPATH}:$f"; fi
+  done
+fi
+
+# If there's a build subdir, use Ivy-retrieved dependencies too.
+if [ -d "$SQOOP_HOME/build/ivy/lib/sqoop" ]; then
+  for f in "$SQOOP_HOME"/build/ivy/lib/sqoop/*/*.jar; do
+    if [ -e "$f" ]; then SQOOP_CLASSPATH="${SQOOP_CLASSPATH}:$f"; fi
+  done
+fi
+
+# First sqoop-*.jar found in the jar dir; left empty if there is none.
+SQOOP_JAR=$(ls -1 "${SQOOP_JAR_DIR}"/sqoop-*.jar | head -n 1)
+export SQOOP_CLASSPATH SQOOP_JAR_DIR SQOOP_SHIM_DIR SQOOP_JAR HADOOP_HOME
+export HADOOP_CLASSPATH="${SQOOP_CLASSPATH}:${HADOOP_CLASSPATH}"
+export HADOOP_OPTS="-Dsqoop.shim.jar.dir=${SQOOP_SHIM_DIR} ${HADOOP_OPTS}"
+
diff --git a/bin/sqoop b/bin/sqoop
index 7bdf5b06..8a8f0118 100755
--- a/bin/sqoop
+++ b/bin/sqoop
@@ -19,43 +19,6 @@
 prgm=`readlink -f $0`
 bin=`dirname ${prgm}`
 bin=`cd ${bin} && pwd`
-if [ -z "$SQOOP_HOME" ]; then
-  SQOOP_HOME=${bin}/..
-fi
-
-if [ -z "${HADOOP_HOME}" ]; then
-  # Try CDH default if the user hasn't set this.
-  HADOOP_HOME=/usr/lib/hadoop
-fi
-
-# Where to find the main Sqoop jar
-SQOOP_JAR_DIR=$SQOOP_HOME
-
-# Where to find the shim jars.
-SQOOP_SHIM_DIR=$SQOOP_HOME/shims
-
-# If there's a "build" subdir, override with this, so we use
-# the newly-compiled copy.
-if [ -d "$SQOOP_JAR_DIR/build" ]; then
-  SQOOP_JAR_DIR="${SQOOP_JAR_DIR}/build"
-
-  if [ -d "$SQOOP_JAR_DIR/shims" ]; then
-    SQOOP_SHIM_DIR="$SQOOP_JAR_DIR/shims"
-  fi
-fi
-
-# Add sqoop dependencies to classpath.
-SQOOP_CLASSPATH=""
-if [ -d "$SQOOP_HOME/lib" ]; then
-  for f in $SQOOP_HOME/lib/*.jar; do
-    SQOOP_CLASSPATH=${SQOOP_CLASSPATH}:$f;
-  done
-fi
-
-SQOOP_JAR=`ls -1 ${SQOOP_JAR_DIR}/sqoop-*.jar | head -n 1`
-
-# Invoke Hadoop
-HADOOP_CLASSPATH="${SQOOP_CLASSPATH}:${HADOOP_CLASSPATH}" \
-  HADOOP_OPTS="-Dsqoop.shim.jar.dir=${SQOOP_SHIM_DIR} ${HADOOP_OPTS}" \
-  ${HADOOP_HOME}/bin/hadoop jar ${SQOOP_JAR} org.apache.hadoop.sqoop.Sqoop "$@"
-
+. "${bin}/configure-sqoop"  # POSIX '.'; sets HADOOP_HOME and SQOOP_JAR
+"${HADOOP_HOME}/bin/hadoop" jar "${SQOOP_JAR}" \
+  org.apache.hadoop.sqoop.Sqoop "$@"
diff --git a/build.xml b/build.xml
index 945b2892..61e8aa53 100644
--- a/build.xml
+++ b/build.xml
@@ -39,6 +39,7 @@
+
@@ -50,6 +51,8 @@
+
+
@@ -123,7 +126,6 @@
-
@@ -131,7 +133,6 @@
-
@@ -146,7 +147,7 @@
+ depends="init, ivy-retrieve-hadoop">
@@ -196,7 +197,7 @@
+
+
+
+
+
+
+
+
+
-
diff --git a/ivy.xml b/ivy.xml
index 0c657f71..a10d1f1e 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -33,18 +33,18 @@
 extends="runtime" description="artifacts needed to compile/test the application"/>
-
+