Rewrite documentation for updated Sqoop API
Add documentation for all SqoopTool implementations. Add database compatibility notes. Separate the user guide from the development guide.

From: Aaron Kimball <aaron@cloudera.com>

git-svn-id: https://svn.apache.org/repos/asf/incubator/sqoop/trunk@1149902 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:

  parent 15f5375d76
  commit faabc51a90
@ -16,28 +16,40 @@
 BUILDROOT=../../build
 BUILD_DIR=$(BUILDROOT)/docs
 
-all: man userguide
+all: man userguide devguide
 
 man: $(BUILD_DIR)/sqoop.1.gz
 
 userguide: $(BUILD_DIR)/SqoopUserGuide.html
 
-$(BUILD_DIR)/sqoop.1.gz: Sqoop-manpage.txt *formatting*.txt
-	asciidoc -b docbook -d manpage Sqoop-manpage.txt
+devguide: $(BUILD_DIR)/SqoopDevGuide.html
+
+$(BUILD_DIR)/sqoop.1.gz: user/Sqoop-manpage.txt user/*formatting*.txt
+	asciidoc -b docbook -d manpage user/Sqoop-manpage.txt
 	xmlto man Sqoop-manpage.xml
 	gzip sqoop.1
 	rm Sqoop-manpage.xml
 	mkdir -p $(BUILD_DIR)
 	mv sqoop.1.gz $(BUILD_DIR)
 
-$(BUILD_DIR)/SqoopUserGuide.html: SqoopUserGuide.txt *.txt
-	asciidoc SqoopUserGuide.txt
+$(BUILD_DIR)/SqoopUserGuide.html: user/*.txt
+	asciidoc -a toc -a toclevels=1 -a "toc-title=Table of Contents" \
+		user/SqoopUserGuide.txt
 	mkdir -p $(BUILD_DIR)
-	mv SqoopUserGuide.html $(BUILD_DIR)
+	mv user/SqoopUserGuide.html $(BUILD_DIR)
+
+$(BUILD_DIR)/SqoopDevGuide.html: dev/*.txt
+	asciidoc -a toc -a toclevels=1 -a "toc-title=Table of Contents" \
+		dev/SqoopDevGuide.txt
+	mkdir -p $(BUILD_DIR)
+	mv dev/SqoopDevGuide.html $(BUILD_DIR)
 
 clean:
 	-rm $(BUILD_DIR)/sqoop.1.gz
 	-rm $(BUILD_DIR)/SqoopUserGuide.html
+	-rm $(BUILD_DIR)/SqoopUserGuide.pdf
+	-rm user/SqoopUserGuide.html
+	-rm dev/SqoopDevGuide.html
 
-.PHONY: all man userguide clean
+.PHONY: all man userguide devguide clean
@ -1,67 +0,0 @@
////
  Licensed to Cloudera, Inc. under one or more
  contributor license agreements. See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License. You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
////


include::intro.txt[]


The Sqoop Command Line
----------------------

To execute Sqoop, run with Hadoop:
----
$ bin/hadoop jar contrib/sqoop/hadoop-$(version)-sqoop.jar (arguments)
----

NOTE: Throughout this document, we will use `sqoop` as shorthand for the
above. i.e., `$ sqoop (arguments)`

You pass this program options describing the
import job you want to perform. If you need a hint, running Sqoop with
`--help` will print out a list of all the command line
options available. The +sqoop(1)+ manual page will also describe
Sqoop's available arguments in greater detail. The manual page is built
in `$HADOOP_HOME/build/contrib/sqoop/doc/sqoop.1.gz`.
The following subsections will describe the most common modes of operation.

include::connecting.txt[]

include::listing-dbs.txt[]

include::listing-tables.txt[]

include::full-db-import.txt[]

include::table-import.txt[]

include::controlling-output-format.txt[]

include::classnames.txt[]

include::misc-args.txt[]

include::direct.txt[]

include::hive.txt[]

include::export.txt[]

include::supported-dbs.txt[]

include::api-reference.txt[]
@ -16,20 +16,13 @@
 limitations under the License.
 ////
 
-Listing Available Databases
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+include::intro.txt[]
 
-Once connected to a database server, you can list the available
-databases with the +--list-databases+ parameter. This currently is supported
-only by HSQLDB and MySQL. Note that in this case, the connect string does
-not include a database name, just a server address.
+include::preface.txt[]
 
-----
-$ sqoop --connect jdbc:mysql://database.example.com/ --list-databases
-information_schema
-employees
-----
+include::compiling.txt[]
 
-_This only works with HSQLDB and MySQL. A vendor-agnostic implementation of
-this function has not yet been implemented._
+include::api-reference.txt[]
@ -19,29 +19,38 @@
 Developer API Reference
 -----------------------
 
-This section is intended to specify the APIs available to application writers
-integrating with Sqoop, and those modifying Sqoop. The next three subsections
-are written from the following three perspectives: those using classes generated
-by Sqoop, and its public library; those writing Sqoop extensions (i.e.,
-additional ConnManager implementations that interact with more databases); and
-those modifying Sqoop's internals. Each section describes the system in
-successively greater depth.
+This section specifies the APIs available to application writers who
+want to integrate with Sqoop, and those who want to modify Sqoop.
+
+The next three subsections are written for the following use cases:
+
+- Using classes generated by Sqoop and its public library
+- Writing Sqoop extensions (that is, additional ConnManager implementations
+  that interact with more databases)
+- Modifying Sqoop's internals
+
+Each section describes the system in successively greater depth.
 
 
 The External API
 ~~~~~~~~~~~~~~~~
 
-Sqoop auto-generates classes that represent the tables imported into HDFS. The
-class contains member fields for each column of the imported table; an instance
-of the class holds one row of the table. The generated classes implement the
-serialization APIs used in Hadoop, namely the _Writable_ and _DBWritable_
-interfaces. They also contain other convenience methods: a +parse()+ method
-that interprets delimited text fields, and a +toString()+ method that preserves
-the user's chosen delimiters. The full set of methods guaranteed to exist in an
-auto-generated class are specified in the interface
-+org.apache.hadoop.sqoop.lib.SqoopRecord+.
+Sqoop automatically generates classes that represent the tables
+imported into the Hadoop Distributed File System (HDFS). The class
+contains member fields for each column of the imported table; an
+instance of the class holds one row of the table. The generated
+classes implement the serialization APIs used in Hadoop, namely the
+_Writable_ and _DBWritable_ interfaces. They also contain these other
+convenience methods:
+
+- A parse() method that interprets delimited text fields
+- A toString() method that preserves the user's chosen delimiters
+
+The full set of methods guaranteed to exist in an auto-generated class
+is specified in the abstract class
++org.apache.hadoop.sqoop.lib.SqoopRecord+.
 
-Instances of _SqoopRecord_ may depend on Sqoop's public API. This is all classes
+Instances of +SqoopRecord+ may depend on Sqoop's public API. This is all classes
 in the +org.apache.hadoop.sqoop.lib+ package. These are briefly described below.
 Clients of Sqoop should not need to directly interact with any of these classes,
 although classes generated by Sqoop will depend on them. Therefore, these APIs
@ -57,16 +66,21 @@ are considered public and care will be taken when forward-evolving them.
 * +BigDecimalSerializer+ contains a pair of methods that facilitate
 serialization of +BigDecimal+ objects over the _Writable_ interface.
 
+The full specification of the public API is available on the Sqoop
+Development Wiki as
+http://wiki.github.com/cloudera/sqoop/sip-4[SIP-4].
+
 
 The Extension API
 ~~~~~~~~~~~~~~~~~
 
 This section covers the API and primary classes used by extensions for Sqoop
 which allow Sqoop to interface with more database vendors.
 
-While Sqoop uses JDBC and +DBInputFormat+ (and +DataDrivenDBInputFormat+) to
+While Sqoop uses JDBC and +DataDrivenDBInputFormat+ to
 read from databases, differences in the SQL supported by different vendors as
 well as JDBC metadata necessitate vendor-specific codepaths for most databases.
-Sqoop's solution to this problem is by introducing the ConnManager API
+Sqoop's solution to this problem is to introduce the +ConnManager+ API
 (+org.apache.hadoop.sqoop.manager.ConnManager+).
 
 +ConnManager+ is an abstract class defining all methods that interact with the
@ -80,40 +94,46 @@ selectively override behavior. For example, the +getColNamesQuery()+ method
 allows the SQL query used by +getColNames()+ to be modified without needing to
 rewrite the majority of +getColNames()+.
 
-+ConnManager+ implementations receive a lot of their configuration data from a
-Sqoop-specific class, +SqoopOptions+. While +SqoopOptions+ does not currently
-contain many setter methods, clients should not assume +SqoopOptions+ are
-immutable. More setter methods may be added in the future. +SqoopOptions+ does
-not directly store specific per-manager options. Instead, it contains a
-reference to the +Configuration+ returned by +Tool.getConf()+ after parsing
-command-line arguments with the +GenericOptionsParser+. This allows extension
-arguments via "+-D any.specific.param=any.value+" without requiring any layering
-of options parsing or modification of +SqoopOptions+.
++ConnManager+ implementations receive a lot of their configuration
+data from a Sqoop-specific class, +SqoopOptions+. +SqoopOptions+ are
+mutable. +SqoopOptions+ does not directly store specific per-manager
+options. Instead, it contains a reference to the +Configuration+
+returned by +Tool.getConf()+ after parsing command-line arguments with
+the +GenericOptionsParser+. This allows extension arguments via "+-D
+any.specific.param=any.value+" without requiring any layering of
+options parsing or modification of +SqoopOptions+. This
++Configuration+ forms the basis of the +Configuration+ passed to any
+MapReduce +Job+ invoked in the workflow, so that users can set on the
+command-line any necessary custom Hadoop state.
 
-All existing +ConnManager+ implementations are stateless. Thus, the system which
-instantiates +ConnManagers+ may implement multiple instances of the same
-+ConnMananger+ class over Sqoop's lifetime. If a caching layer is required, we
-can add one later, but it is not currently available.
+All existing +ConnManager+ implementations are stateless. Thus, the
+system which instantiates +ConnManagers+ may create multiple
+instances of the same +ConnManager+ class over Sqoop's lifetime. It
+is currently assumed that instantiating a +ConnManager+ is a
+lightweight operation, and is done reasonably infrequently. Therefore,
++ConnManagers+ are not cached between operations, etc.
 
-+ConnManagers+ are currently created by instances of the abstract class +ManagerFactory+ (See
-MAPREDUCE-750). One +ManagerFactory+ implementation currently serves all of
-Sqoop: +org.apache.hadoop.sqoop.manager.DefaultManagerFactory+. Extensions
-should not modify +DefaultManagerFactory+. Instead, an extension-specific
-+ManagerFactory+ implementation should be provided with the new ConnManager.
-+ManagerFactory+ has a single method of note, named +accept()+. This method will
-determine whether it can instantiate a +ConnManager+ for the user's
-+SqoopOptions+. If so, it returns the +ConnManager+ instance. Otherwise, it
-returns +null+.
++ConnManagers+ are currently created by instances of the abstract
+class +ManagerFactory+ (see
+http://issues.apache.org/jira/browse/MAPREDUCE-750[]). One
++ManagerFactory+ implementation currently serves all of Sqoop:
++org.apache.hadoop.sqoop.manager.DefaultManagerFactory+. Extensions
+should not modify +DefaultManagerFactory+. Instead, an
+extension-specific +ManagerFactory+ implementation should be provided
+with the new +ConnManager+. +ManagerFactory+ has a single method of
+note, named +accept()+. This method will determine whether it can
+instantiate a +ConnManager+ for the user's +SqoopOptions+. If so, it
+returns the +ConnManager+ instance. Otherwise, it returns +null+.
 
 The +ManagerFactory+ implementations used are governed by the
-+sqoop.connection.factories+ setting in sqoop-site.xml. Users of extension
++sqoop.connection.factories+ setting in +sqoop-site.xml+. Users of extension
 libraries can install the 3rd-party library containing a new +ManagerFactory+
-and +ConnManager+(s), and configure sqoop-site.xml to use the new
+and +ConnManager+(s), and configure +sqoop-site.xml+ to use the new
 +ManagerFactory+. The +DefaultManagerFactory+ principally discriminates between
 databases by parsing the connect string stored in +SqoopOptions+.
 
 Extension authors may make use of classes in the +org.apache.hadoop.sqoop.io+,
-+mapred+, +mapreduce+, and +util+ packages to facilitate their implementations.
++mapreduce+, and +util+ packages to facilitate their implementations.
 These packages and classes are described in more detail in the following
 section.
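As an illustration of the extension point described in this hunk, here is a minimal sketch of an extension-specific +ManagerFactory+. It is not part of this patch: the package, class name, +jdbc:somedb:+ scheme, and driver class are invented, and the +accept()+ signature is assumed from the text above, so treat it as a sketch rather than the definitive API.

----
package com.example.sqoop;   // hypothetical extension package

import org.apache.hadoop.sqoop.SqoopOptions;
import org.apache.hadoop.sqoop.manager.ConnManager;
import org.apache.hadoop.sqoop.manager.GenericJdbcManager;
import org.apache.hadoop.sqoop.manager.ManagerFactory;

public class SomeDbManagerFactory extends ManagerFactory {
  @Override
  public ConnManager accept(SqoopOptions options) {
    String connectStr = options.getConnectString();
    if (connectStr != null && connectStr.startsWith("jdbc:somedb:")) {
      // A real extension would return its own ConnManager subclass here;
      // GenericJdbcManager with a hypothetical driver is a stand-in for brevity.
      return new GenericJdbcManager("com.example.somedb.Driver", options);
    }
    return null;  // not ours; let another ManagerFactory handle this connect string
  }
}
----

Such a factory would then be named in the +sqoop.connection.factories+ list in +sqoop-site.xml+ so that +ConnFactory+ consults it.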
@ -134,35 +154,43 @@ General program flow
 The general program flow is as follows:
 
 +org.apache.hadoop.sqoop.Sqoop+ is the main class and implements _Tool_. A new
-instance is launched with +ToolRunner+. It parses its arguments using the
-+SqoopOptions+ class. Within the +SqoopOptions+, an +ImportAction+ will be
-chosen by the user. This may be import all tables, import one specific table,
-execute a SQL statement, or others.
+instance is launched with +ToolRunner+. The first argument to Sqoop is
+a string identifying the name of a +SqoopTool+ to run. The +SqoopTool+
+itself drives the execution of the user's requested operation (e.g.,
+import, export, codegen, etc).
 
-A +ConnManager+ is then instantiated based on the data in the +SqoopOptions+.
-The +ConnFactory+ is used to get a +ConnManager+ from a +ManagerFactory+; the
-mechanics of this were described in an earlier section.
+The +SqoopTool+ API is specified fully in
+http://wiki.github.com/cloudera/sqoop/sip-1[SIP-1].
 
-Then in the +run()+ method, using a case statement, it determines which actions
-the user needs performed based on the +ImportAction+ enum. Usually this involves
-determining a list of tables to import, generating user code for them, and
-running a MapReduce job per table to read the data. The import itself does not
-specifically need to be run via a MapReduce job; the +ConnManager.importTable()+
-method is left to determine how best to run the import. Each of these actions is
-controlled by the +ConnMananger+, except for the generating of code, which is
-done by the +CompilationManager+ and +ClassWriter+. (Both in the
-+org.apache.hadoop.sqoop.orm+ package.) Importing into Hive is also taken care
-of via the +org.apache.hadoop.sqoop.hive.HiveImport+ class after the
-+importTable()+ has completed. This is done without concern for the
-+ConnManager+ implementation used.
+The chosen +SqoopTool+ will parse the remainder of the arguments,
+setting the appropriate fields in the +SqoopOptions+ class. It will
+then run its body.
+
+Then in the SqoopTool's +run()+ method, the import or export or other
+action proper is executed. Typically, a +ConnManager+ is then
+instantiated based on the data in the +SqoopOptions+. The
++ConnFactory+ is used to get a +ConnManager+ from a +ManagerFactory+;
+the mechanics of this were described in an earlier section. Imports
+and exports and other large data motion tasks typically run a
+MapReduce job to operate on a table in a parallel, reliable fashion.
+An import does not specifically need to be run via a MapReduce job;
+the +ConnManager.importTable()+ method is left to determine how best
+to run the import. Each main action is actually controlled by the
++ConnManager+, except for the generating of code, which is done by
+the +CompilationManager+ and +ClassWriter+. (Both in the
++org.apache.hadoop.sqoop.orm+ package.) Importing into Hive is also
+taken care of via the +org.apache.hadoop.sqoop.hive.HiveImport+ class
+after the +importTable()+ has completed. This is done without concern
+for the +ConnManager+ implementation used.
 
-A ConnManager's +importTable()+ method receives a single argument of type
-+ImportJobContext+ which contains parameters to the method. This class may be
-extended with additional parameters in the future, which optionally further
-direct the import operation. Similarly, the +exportTable()+ method receives an
-argument of type +ExportJobContext+. These classes contain the name of the table
-to import/export, a reference to the +SqoopOptions+ object, and other related
-data.
+A ConnManager's +importTable()+ method receives a single argument of
+type +ImportJobContext+ which contains parameters to the method. This
+class may be extended with additional parameters in the future, which
+optionally further direct the import operation. Similarly, the
++exportTable()+ method receives an argument of type
++ExportJobContext+. These classes contain the name of the table to
+import/export, a reference to the +SqoopOptions+ object, and other
+related data.
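For orientation, a skeletal tool following the flow described above might look like the sketch below. The class name is invented, and the constructor and +run()+ signature are assumptions based on this description and SIP-1, not a verified API, so treat it only as a sketch.

----
import org.apache.hadoop.sqoop.SqoopOptions;
import org.apache.hadoop.sqoop.tool.SqoopTool;

public class HelloTool extends SqoopTool {
  public HelloTool() {
    super("hello");   // the tool name given as Sqoop's first argument
  }

  @Override
  public int run(SqoopOptions options) {
    // By this point the remaining arguments have been parsed into 'options';
    // a real tool would drive an import, export, or codegen operation here.
    System.out.println("Connect string: " + options.getConnectString());
    return 0;         // exit status reported back through ToolRunner
  }
}
----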
@ -175,8 +203,9 @@ The following subpackages under +org.apache.hadoop.sqoop+ exist:
 * +lib+ - The external public API (described earlier).
 * +manager+ - The +ConnManager+ and +ManagerFactory+ interface and their
   implementations.
-* +mapreduce+ - Classes interfacing with the new (0.20+) MapReduce API....
+* +mapreduce+ - Classes interfacing with the new (0.20+) MapReduce API.
 * +orm+ - Code auto-generation.
+* +tool+ - Implementations of +SqoopTool+.
 * +util+ - Miscellaneous utility classes.
 
 The +io+ package contains _OutputStream_ and _BufferedWriter_ implementations
@ -185,11 +214,13 @@ BufferedWriter to be opened to a client which will, under the hood, write to
 multiple files in series as they reach a target threshold size. This allows
 unsplittable compression libraries (e.g., gzip) to be used in conjunction with
 Sqoop import while still allowing subsequent MapReduce jobs to use multiple
-input splits per dataset.
+input splits per dataset. The large object file storage (see
+http://wiki.github.com/cloudera/sqoop/sip-3[SIP-3]) system's code
+lies in the +io+ package as well.
 
-The +mapreduce+ package contains +DataDrivenImportJob+, which uses the
-+DataDrivenDBInputFormat+ introduced in 0.21. Most +ConnManager+
-implementations use +DataDrivenImportJob+ to perform their imports.
+The +mapreduce+ package contains code that interfaces directly with
+Hadoop MapReduce. This package's contents are described in more detail
+in the next section.
 
 The +orm+ package contains code used for class generation. It depends on the
 JDK's tools.jar which provides the com.sun.tools.javac package.
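To connect the +orm+ code generation back to "The External API" above, the following short, self-contained sketch is an invented stand-in for the kind of record class that generation produces: one member field per column, a +parse()+ method for delimited text, and a +toString()+ that re-emits the delimiters. It is illustrative only and not part of this patch; real generated classes also implement _Writable_, _DBWritable_, and +SqoopRecord+.

----
// Illustrative stand-in for a Sqoop-generated record class (invented names).
public class EmployeeRecord {
  private Integer id;     // one member field per imported column
  private String name;

  // parse(): interpret one line of delimited text into the member fields.
  public void parse(CharSequence line) {
    String[] fields = line.toString().trim().split(",");
    this.id = Integer.valueOf(fields[0]);
    this.name = fields[1];
  }

  // toString(): re-emit the row using the chosen delimiters.
  @Override
  public String toString() {
    return id + "," + name + "\n";
  }

  public static void main(String[] args) {
    EmployeeRecord r = new EmployeeRecord();
    r.parse("1,Aaron");
    System.out.print(r);   // prints "1,Aaron" followed by a newline
  }
}
----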
@ -237,3 +268,29 @@ and forward the data along to HDFS, possibly performing formatting conversions
 in the meantime.
 
 
+Interfacing with MapReduce
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Sqoop schedules MapReduce jobs to effect imports and exports.
+Configuration and execution of MapReduce jobs follows a few common
+steps (configuring the +InputFormat+; configuring the +OutputFormat+;
+setting the +Mapper+ implementation; etc...). These steps are
+formalized in the +org.apache.hadoop.sqoop.mapreduce.JobBase+ class.
+The +JobBase+ allows a user to specify the +InputFormat+,
++OutputFormat+, and +Mapper+ to use.
+
++JobBase+ itself is subclassed by +ImportJobBase+ and +ExportJobBase+
+which offer better support for the particular configuration steps
+common to import or export-related jobs, respectively.
++ImportJobBase.runImport()+ will call the configuration steps and run
+a job to import a table to HDFS.
+
+Subclasses of these base classes exist as well. For example,
++DataDrivenImportJob+ uses the +DataDrivenDBInputFormat+ to run an
+import. This is the most common type of import used by the various
++ConnManager+ implementations available. MySQL uses a different class
+(+MySQLDumpImportJob+) to run a direct-mode import. Its custom
++Mapper+ and +InputFormat+ implementations reside in this package as
+well.
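The layering just described can be pictured with a small, self-contained sketch. These are not the real Sqoop classes (the real ones live in +org.apache.hadoop.sqoop.mapreduce+ and differ in detail); the sketch only shows how a common job base class can centralize the InputFormat/OutputFormat/Mapper configuration steps while subclasses fill in import-specific choices.

----
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// Conceptual stand-ins for JobBase / ImportJobBase (illustration only).
abstract class DemoJobBase {
  protected Job createJob(Configuration conf) throws Exception {
    Job job = new Job(conf);        // 0.20-era "new" MapReduce API
    configureInputFormat(job);      // the common configuration steps
    configureOutputFormat(job);
    configureMapper(job);
    return job;
  }
  protected abstract void configureInputFormat(Job job);
  protected abstract void configureOutputFormat(Job job);
  protected abstract void configureMapper(Job job);
}

class DemoImportJob extends DemoJobBase {
  @Override protected void configureInputFormat(Job job) {
    job.setInputFormatClass(TextInputFormat.class);
  }
  @Override protected void configureOutputFormat(Job job) {
    job.setOutputFormatClass(TextOutputFormat.class);
  }
  @Override protected void configureMapper(Job job) {
    job.setMapperClass(Mapper.class);  // identity mapper as a placeholder
  }
}
----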
src/docs/dev/compiling.txt (new file, 32 lines)
@ -0,0 +1,32 @@
////
  Licensed to Cloudera, Inc. under the Apache License, Version 2.0;
  see http://www.apache.org/licenses/LICENSE-2.0 and the NOTICE file.
////


Compiling Sqoop from Source
---------------------------

You can obtain the source code for Sqoop at:
http://github.com/cloudera/sqoop

Sqoop source code is held in a +git+ repository. Instructions for
retrieving source from the repository are provided at:
http://wiki.github.com/cloudera/sqoop/DevelopmentProcess

Compilation instructions are provided in the +COMPILING.txt+ file in
the root of the source repository.
@ -20,15 +20,9 @@
 Introduction
 ------------
 
-Sqoop is a tool designed to help users of large data import
-existing relational databases into their Hadoop clusters. Sqoop uses
-JDBC to connect to a database, examine each table's schema, and
-auto-generate the necessary classes to import data into HDFS. It
-then instantiates a MapReduce job to read tables from the database
-via the DBInputFormat (JDBC-based InputFormat). Tables are read
-into a set of files loaded into HDFS. Both SequenceFile and
-text-based targets are supported. Sqoop also supports high-performance
-imports from select databases including MySQL.
-
-This document describes how to get started using Sqoop to import
-your data into Hadoop.
+If you are a developer or an application programmer who intends to
+modify Sqoop or build an extension using one of Sqoop's internal APIs,
+you should read this document. The following sections describe the
+purpose of each API, where internal APIs are used, and which APIs are
+necessary for implementing support for additional databases.
src/docs/dev/preface.txt (new file, 55 lines)
@ -0,0 +1,55 @@
////
  Licensed to Cloudera, Inc. under the Apache License, Version 2.0;
  see http://www.apache.org/licenses/LICENSE-2.0 and the NOTICE file.
////


Supported Releases
------------------

This documentation applies to Sqoop v1.0.0 (June 2010).

Sqoop Releases
--------------

Sqoop is an open source software product of Cloudera, Inc. Software
development for Sqoop occurs at http://github.com/cloudera/sqoop. At
that site, you can obtain:

- New releases of Sqoop as well as its most recent source code
- An issue tracker
- A wiki that contains Sqoop documentation


Prerequisites
-------------

The following prerequisite knowledge is required for Sqoop:

- Software development in Java
  * Familiarity with JDBC
  * Familiarity with Hadoop's APIs (including the "new" MapReduce API of
    0.20+)
- Relational database management systems and SQL

This document assumes you are using a Linux or Linux-like environment.
If you are using Windows, you may be able to use cygwin to accomplish
most of the following tasks. If you are using Mac OS X, you should see
few (if any) compatibility errors. Sqoop is predominantly operated and
tested on Linux.
@ -1,77 +0,0 @@
////
  Licensed to Cloudera, Inc. under the Apache License, Version 2.0;
  see http://www.apache.org/licenses/LICENSE-2.0 and the NOTICE file.
////


Direct-mode Imports
-------------------

While the JDBC-based import method used by Sqoop provides it with the
ability to read from a variety of databases using a generic driver, it
is not the most high-performance method available. Sqoop can read from
certain database systems faster by using their built-in export tools.

For example, Sqoop can read from a MySQL database by using the +mysqldump+
tool distributed with MySQL. You can take advantage of this faster
import method by running Sqoop with the +--direct+ argument. This
combined with a connect string that begins with +jdbc:mysql://+ will
inform Sqoop that it should select the faster access method.

If your delimiters exactly match the delimiters used by +mysqldump+,
then Sqoop will use a fast-path that copies the data directly from
+mysqldump+'s output into HDFS. Otherwise, Sqoop will parse +mysqldump+'s
output into fields and transcode them into the user-specified delimiter set.
This incurs additional processing, so performance may suffer.
For convenience, the +--mysql-delimiters+
argument will set all the output delimiters to be consistent with
+mysqldump+'s format.

Sqoop also provides a direct-mode backend for PostgreSQL that uses the
+COPY TO STDOUT+ protocol from +psql+. No specific delimiter set provides
better performance; Sqoop will forward delimiter control arguments to
+psql+.

The "Supported Databases" section provides a full list of database vendors
which have direct-mode support from Sqoop.

When writing to HDFS, direct mode will open a single output file to receive
the results of the import. You can instruct Sqoop to use multiple output
files by using the +--direct-split-size+ argument which takes a size in
bytes. Sqoop will generate files of approximately this size. e.g.,
+--direct-split-size 1000000+ will generate files of approximately 1 MB
each. If compressing the HDFS files with +--compress+, this will allow
subsequent MapReduce programs to use multiple mappers across your data
in parallel.

Tool-specific arguments
~~~~~~~~~~~~~~~~~~~~~~~

Sqoop will generate a set of command-line arguments with which it invokes
the underlying direct-mode tool (e.g., mysqldump). You can specify additional
arguments which should be passed to the tool by passing them to Sqoop
after a single '+-+' argument. e.g.:

----
$ sqoop --connect jdbc:mysql://localhost/db --table foo --direct - --lock-tables
----

The +--lock-tables+ argument (and anything else to the right of the +-+ argument)
will be passed directly to mysqldump.
@ -1,76 +0,0 @@
////
  Licensed to Cloudera, Inc. under the Apache License, Version 2.0;
  see http://www.apache.org/licenses/LICENSE-2.0 and the NOTICE file.
////


Exporting to a Database
-----------------------

In addition to importing database tables into HDFS, Sqoop can also
work in "reverse," reading the contents of a file or directory in
HDFS, interpreting the data as database rows, and inserting them
into a specified database table.

To run an export, invoke Sqoop with the +--export-dir+ and
+--table+ options. e.g.:

----
$ sqoop --connect jdbc:mysql://db.example.com/foo --table bar \
    --export-dir /results/bar_data
----

This will take the files in +/results/bar_data+ and inject their
contents in to the +bar+ table in the +foo+ database on +db.example.com+.
The target table must already exist in the database. Sqoop will perform
a set of +INSERT INTO+ operations, without regard for existing content. If
Sqoop attempts to insert rows which violate constraints in the database
(e.g., a particular primary key value already exists), then the export
will fail.

As in import mode, Sqoop will auto-generate an interoperability class
to use with the particular table in question. This will be used to parse
the records in HDFS files before loading their contents into the database.
You must specify the same delimiters (e.g., with +--fields-terminated-by+,
etc.) as are used in the files to export in order to parse the data
correctly. If your data is stored in SequenceFiles (created with an import
in the +--as-sequencefile+ format), then you do not need to specify
delimiters.

If you have an existing auto-generated jar and class that you intend to use
with Sqoop, you can specify these with the +--jar-file+ and +--class-name+
parameters. Providing these options will disable autogeneration of a new
class based on the target table.


Exporting to MySQL
~~~~~~~~~~~~~~~~~~

MySQL supports a direct mode for exports. If the +--direct+ argument is given
when exporting to a MySQL database, Sqoop will use instances of +mysqlimport+
to manage the export process.

For performance, each writer will commit approximately every 32 MB of exported
data. This can be controlled by passing the following argument _before_ any
named parameters: +-D sqoop.mysql.export.checkpoint.bytes=_size_+, where _size_
is a value in bytes. Setting _size_ to 0 will disable intermediate checkpoints,
although individual files being exported will continue to be committed
independently of one another.

IMPORTANT: Note that any arguments to Sqoop that are of the form
+-D parameter=value+ must appear before any named arguments (e.g., +--connect+,
+--table+, etc).
@ -1,92 +0,0 @@
////
  Licensed to Cloudera, Inc. under the Apache License, Version 2.0;
  see http://www.apache.org/licenses/LICENSE-2.0 and the NOTICE file.
////


Automatic Full-database Import
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If you want to import all the tables in a database, you can use the
+--all-tables+ command to do so:

----
$ sqoop --connect jdbc:mysql://database.example.com/employees --all-tables
----

This will query the database for the available tables, generate an ORM
class for each table, and run a MapReduce job to import each one.
Hadoop uses the DBInputFormat to read from a database into a Mapper
instance. To read a table into a MapReduce program requires creating a
class to hold the fields of one row of the table. One of the benefits
of Sqoop is that it generates this class definition for you, based on
the table definition in the database.

The generated +.java+ files are, by default, placed in the current
directory. You can supply a different directory with the +--outdir+
parameter. These are then compiled into +.class+ and +.jar+ files for use
by the MapReduce job that it launches. These files are created in a
temporary directory. You can redirect this target with +--bindir+.

Each table will be imported into a separate directory in HDFS, with
the same name as the table. For instance, if my Hadoop username is
aaron, the above command would have generated the following
directories in HDFS:

----
/user/aaron/employee_names
/user/aaron/payroll_checks
/user/aaron/job_descriptions
/user/aaron/office_supplies
----

You can change the base directory under which the tables are loaded
with the +--warehouse-dir+ parameter. For example:

----
$ sqoop --connect jdbc:mysql://database.example.com/employees --all-tables \
    --warehouse-dir /common/warehouse
----

This would create the following directories instead:

----
/common/warehouse/employee_names
/common/warehouse/payroll_checks
/common/warehouse/job_descriptions
/common/warehouse/office_supplies
----

By default the data will be read into text files in HDFS. Each of the
columns will be represented as comma-delimited text. Each row is
terminated by a newline. See the section on "Controlling the Output
Format" below for information on how to change these delimiters.

If you want to leverage compression and binary file formats, the
+--as-sequencefile+ argument to Sqoop will import the table
to a set of SequenceFiles instead. This stores each field of each
database record in a separate object in a SequenceFile.
This representation is also likely to be higher performance when used
as an input to subsequent MapReduce programs as it does not require
parsing. For completeness, Sqoop provides an +--as-textfile+ option, which is
implied by default. An +--as-textfile+ on the command-line will override
a previous +--as-sequencefile+ argument.

The SequenceFile format will embed the records from the database as
objects using the code generated by Sqoop. It is important that you
retain the +.java+ file for this class, as you will need to be able to
instantiate the same type to read the objects back later, in other
user-defined applications.
@ -1,72 +0,0 @@
////
  Licensed to Cloudera, Inc. under the Apache License, Version 2.0;
  see http://www.apache.org/licenses/LICENSE-2.0 and the NOTICE file.
////


Importing Data Into Hive
------------------------

Sqoop's primary function is to upload your data into files in HDFS. If
you have a Hive metastore associated with your HDFS cluster, Sqoop can
also import the data into Hive by generating and executing a +CREATE
TABLE+ statement to define the data's layout in Hive. Importing data
into Hive is as simple as adding the *+--hive-import+* option to your
Sqoop command line.

By default the data is imported into HDFS, but you can skip this operation
by using the *+--hive-create+* option. Optionally, you can specify the
*+--hive-overwrite+* option to indicate that existing table in hive must
be replaced. After your data is imported into HDFS or this step is
omitted, Sqoop will generate a Hive script containing a +CREATE TABLE+
operation defining your columns using Hive's types, and a +LOAD DATA INPATH+
statement to move the data files into Hive's warehouse directory if
*+--hive-create+* option is not added. The script will be executed by calling
the installed copy of hive on the machine where Sqoop is run. If you have
multiple Hive installations, or +hive+ is not in your +$PATH+ use the
*+--hive-home+* option to identify the Hive installation directory.
Sqoop will use +$HIVE_HOME/bin/hive+ from here.

NOTE: This function is incompatible with +--as-sequencefile+.

Hive's text parser does not know how to support escaping or enclosing
characters. Sqoop will print a warning if you use +--escaped-by+,
+--enclosed-by+, or +--optionally-enclosed-by+ since Hive does not know
how to parse these. It will pass the field and record terminators through
to Hive. If you do not set any delimiters and do use +--hive-import+,
the field delimiter will be set to +^A+ and the record delimiter will
be set to +\n+ to be consistent with Hive's defaults.

The table name used in Hive is, by default, the same as that of the
source table. You can control the output table name with the +--hive-table+
option.

If Hive import commands are used in conjunction with the +--generate-only+
option, then a Hive import will not occur. Instead, the DDL commands to
perform the import from HDFS to Hive are written to a file named +_tableName_.q+
which you can then execute with +hive -f+ after the data is brought into
HDFS.

Hive's Type System
~~~~~~~~~~~~~~~~~~

Hive users will note that there is not a one-to-one mapping between
SQL types and Hive types. In general, SQL types that do not have a
direct mapping (e.g., +DATE+, +TIME+, and +TIMESTAMP+) will be coerced to
+STRING+ in Hive. The +NUMERIC+ and +DECIMAL+ SQL types will be coerced to
+DOUBLE+. In these cases, Sqoop will emit a warning in its log messages
informing you of the loss of precision.
@ -1,48 +0,0 @@
////
  Licensed to Cloudera, Inc. under the Apache License, Version 2.0;
  see http://www.apache.org/licenses/LICENSE-2.0 and the NOTICE file.
////


Miscellaneous Additional Arguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If you want to generate the Java classes to represent tables without
actually performing an import, supply a connect string and
(optionally) credentials as above, as well as +--all-tables+ or
+--table+, but also use the +--generate-only+ argument. This will
generate the classes and cease further operation.

You can override the +$HADOOP_HOME+ environment variable within Sqoop
with the +--hadoop-home+ argument. You can override the +$HIVE_HOME+
environment variable with +--hive-home+.

Data emitted to HDFS is by default uncompressed. You can instruct
Sqoop to use gzip to compress your data by providing either the
+--compress+ or +-z+ argument (both are equivalent).

Small CLOB and BLOB values will be imported as string-based data inline
with the rest of their containing record. Over a size threshold (by
default, 16 MB per object), these values will not be materialized directly,
inline, and will be written to external files in HDFS; the inline records
will contain pointers to these files. The inline materialization limit can
be controlled with the +--inline-lob-limit+ argument; the limit itself is
specified in bytes.

Using +--verbose+ will instruct Sqoop to print more details about its
operation; this is particularly handy if Sqoop appears to be misbehaving.
@ -1,55 +0,0 @@
////
  Licensed to Cloudera, Inc. under the Apache License, Version 2.0;
  see http://www.apache.org/licenses/LICENSE-2.0 and the NOTICE file.
////


Supported Databases
-------------------

Sqoop uses JDBC to connect to databases. JDBC is a compatibility layer
that allows a program to access many different databases through a common
API. Slight differences in the SQL language spoken by each database, however,
may mean that Sqoop can't use every database out of the box, or that some
databases may be used in an inefficient manner.

When you provide a connect string to Sqoop, it inspects the protocol scheme to
determine appropriate vendor-specific logic to use. If Sqoop knows about
a given database, it will work automatically. If not, you may need to
specify the driver class to load via +--driver+. This will use a generic
code path which will use standard SQL to access the database. Sqoop provides
some databases with faster, non-JDBC-based access mechanisms. These can be
enabled by specifying the +--direct+ parameter.

Sqoop includes vendor-specific code paths for the following databases:

[grid="all"]
`-----------`--------`--------------------`---------------------
Database     version  +--direct+ support?  connect string matches
----------------------------------------------------------------
HSQLDB       1.8.0+   No                   +jdbc:hsqldb:*//+
MySQL        5.0+     Yes                  +jdbc:mysql://+
Oracle       10.2.0+  No                   +jdbc:oracle:*//+
PostgreSQL   8.3+     Yes (import only)    +jdbc:postgresql://+
----------------------------------------------------------------

Sqoop may work with older versions of the databases listed, but we have
only tested it with the versions specified above.

Even if Sqoop supports a database internally, you may still need to
install the database vendor's JDBC driver in your +$HADOOP_HOME/lib+
path.
@ -1,68 +0,0 @@
////
  Licensed to Cloudera, Inc. under the Apache License, Version 2.0;
  see http://www.apache.org/licenses/LICENSE-2.0 and the NOTICE file.
////


Importing Individual Tables
~~~~~~~~~~~~~~~~~~~~~~~~~~~

In addition to full-database imports, Sqoop will allow you to import
individual tables. Instead of using +--all-tables+, specify the name of
a particular table with the +--table+ argument:

----
$ sqoop --connect jdbc:mysql://database.example.com/employees \
    --table employee_names
----

You can further specify a subset of the columns in a table by using
the +--columns+ argument. This takes a list of column names, delimited
by commas, with no spaces in between. e.g.:

----
$ sqoop --connect jdbc:mysql://database.example.com/employees \
    --table employee_names --columns employee_id,first_name,last_name,dept_id
----

Sqoop will use a MapReduce job to read sections of the table in
parallel. For the MapReduce tasks to divide the table space, the
results returned by the database must be orderable. Sqoop will
automatically detect the primary key for a table and use that to order
the results. If no primary key is available, or (less likely) you want
to order the results along a different column, you can specify the
column name with +--split-by+.

.Row ordering
IMPORTANT: To guarantee correctness of your input, you must select an
ordering column for which each row has a unique value. If duplicate
values appear in the ordering column, the results of the import are
undefined, and Sqoop will not be able to detect the error.

Finally, you can control which rows of a table are imported via the
+--where+ argument. With this argument, you may specify a clause to be
appended to the SQL statement used to select rows from the table,
e.g.:

----
$ sqoop --connect jdbc:mysql://database.example.com/employees \
    --table employee_names --where "employee_id > 40 AND active = 1"
----

The +--columns+, +--split-by+, and +--where+ arguments are incompatible with
+--all-tables+. If you require special handling for some of the tables,
then you must manually run a separate import job for each table.
src/docs/user/SqoopUserGuide.txt (new file, 52 lines)
@ -0,0 +1,52 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
include::intro.txt[]
|
||||||
|
|
||||||
|
include::preface.txt[]
|
||||||
|
|
||||||
|
include::basics.txt[]
|
||||||
|
|
||||||
|
include::tools.txt[]
|
||||||
|
|
||||||
|
include::import.txt[]
|
||||||
|
|
||||||
|
include::import-all-tables.txt[]
|
||||||
|
|
||||||
|
include::export.txt[]
|
||||||
|
|
||||||
|
include::codegen.txt[]
|
||||||
|
|
||||||
|
include::create-hive-table.txt[]
|
||||||
|
|
||||||
|
include::eval.txt[]
|
||||||
|
|
||||||
|
include::list-databases.txt[]
|
||||||
|
|
||||||
|
include::list-tables.txt[]
|
||||||
|
|
||||||
|
include::help.txt[]
|
||||||
|
|
||||||
|
include::version.txt[]
|
||||||
|
|
||||||
|
include::compatibility.txt[]
|
||||||
|
|
||||||
|
include::support.txt[]
|
||||||
|
|
||||||
|
|
src/docs/user/basics.txt (new file, 63 lines)
@ -0,0 +1,63 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
Basic Usage
|
||||||
|
-----------
|
||||||
|
|
||||||
|
With Sqoop, you can _import_ data from a relational database system into
|
||||||
|
HDFS. The input to the import process is a database table. Sqoop
|
||||||
|
will read the table row-by-row into HDFS. The output of this import
|
||||||
|
process is a set of files containing a copy of the imported table.
|
||||||
|
The import process is performed in parallel. For this reason, the
|
||||||
|
output will be in multiple files. These files may be delimited text
|
||||||
|
files (for example, with commas or tabs separating each field), or
|
||||||
|
binary SequenceFiles containing serialized record data.
|
||||||
|
|
||||||
|
A by-product of the import process is a generated Java class which
|
||||||
|
can encapsulate one row of the imported table. This class is used
|
||||||
|
during the import process by Sqoop itself. The Java source code for
|
||||||
|
this class is also provided to you, for use in subsequent MapReduce
|
||||||
|
processing of the data. This class can serialize and deserialize data
|
||||||
|
to and from the SequenceFile format. It can also parse the
|
||||||
|
delimited-text form of a record. These abilities allow you to quickly
|
||||||
|
develop MapReduce applications that use the HDFS-stored records in
|
||||||
|
your processing pipeline. You are also free to parse the delimited
|
||||||
|
record data yourself, using any other tools you prefer.
|
||||||
|
|
||||||
|
After manipulating the imported records (for example, with MapReduce
|
||||||
|
or Hive) you may have a result data set which you can then _export_
|
||||||
|
back to the relational database. Sqoop's export process will read
|
||||||
|
a set of delimited text files from HDFS in parallel, parse them into
|
||||||
|
records, and insert them as new rows in a target database table, for
|
||||||
|
consumption by external applications or users.
|
||||||
|
|
||||||
|
Sqoop includes some other commands which allow you to inspect the
|
||||||
|
database you are working with. For example, you can list the available
|
||||||
|
database schemas (with the +sqoop-list-databases+ tool) and tables
|
||||||
|
within a schema (with the +sqoop-list-tables+ tool). Sqoop also
|
||||||
|
includes a primitive SQL execution shell (the +sqoop-eval+ tool).
|
||||||
|
|
||||||
|
Most aspects of the import, code generation, and export processes can
|
||||||
|
be customized. You can control the specific row range or columns imported.
|
||||||
|
You can specify particular delimiters and escape characters for the
|
||||||
|
file-based representation of the data, as well as the file format
|
||||||
|
used. You can also control the class or package names used in
|
||||||
|
generated code. Subsequent sections of this document explain how to
|
||||||
|
specify these and other arguments to Sqoop.
|
||||||
|
|
||||||
|
|
src/docs/user/codegen-args.txt (new file, 33 lines)
@ -0,0 +1,33 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
.Code generation arguments:
|
||||||
|
[grid="all"]
|
||||||
|
`------------------------`-----------------------------------------------
|
||||||
|
Argument Description
|
||||||
|
-------------------------------------------------------------------------
|
||||||
|
+\--bindir <dir>+ Output directory for compiled objects
|
||||||
|
+\--class-name <name>+ Sets the generated class name. This overrides\
|
||||||
|
+\--package-name+. When combined with \
|
||||||
|
+\--jar-file+, sets the input class.
|
||||||
|
+\--jar-file <file>+ Disable code generation; use specified jar
|
||||||
|
+\--outdir <dir>+ Output directory for generated code
|
||||||
|
+\--package-name <name>+ Put auto-generated classes in this package
|
||||||
|
-------------------------------------------------------------------------
|
||||||
|
|
src/docs/user/codegen.txt (new file, 83 lines)
@ -0,0 +1,83 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
+sqoop-codegen+
|
||||||
|
---------------
|
||||||
|
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
~~~~~~~
|
||||||
|
|
||||||
|
The +codegen+ tool generates Java classes which encapsulate and
|
||||||
|
interpret imported records. The Java definition of a record is
|
||||||
|
instantiated as part of the import process, but code generation can also be performed
|
||||||
|
separately. For example, if Java source is lost, it can be recreated.
|
||||||
|
New versions of a class can be created which use different delimiters
|
||||||
|
between fields, and so on.
|
||||||
|
|
||||||
|
Syntax
|
||||||
|
~~~~~~
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop codegen (generic-args) (codegen-args)
|
||||||
|
$ sqoop-codegen (generic-args) (codegen-args)
|
||||||
|
----
|
||||||
|
|
||||||
|
Although the Hadoop generic arguments must precede any codegen arguments,
|
||||||
|
the codegen arguments can be entered in any order with respect to one
|
||||||
|
another.
|
||||||
|
|
||||||
|
|
||||||
|
include::common-args.txt[]
|
||||||
|
|
||||||
|
.Code generation arguments:
|
||||||
|
[grid="all"]
|
||||||
|
`------------------------`-----------------------------------------------
|
||||||
|
Argument Description
|
||||||
|
-------------------------------------------------------------------------
|
||||||
|
+\--bindir <dir>+ Output directory for compiled objects
|
||||||
|
+\--class-name <name>+ Sets the generated class name. This overrides\
|
||||||
|
+\--package-name+.
|
||||||
|
+\--outdir <dir>+ Output directory for generated code
|
||||||
|
+\--package-name <name>+ Put auto-generated classes in this package
|
||||||
|
+\--table <table-name>+ Name of the table to generate code for.
|
||||||
|
-------------------------------------------------------------------------
|
||||||
|
|
||||||
|
include::output-args.txt[]
|
||||||
|
|
||||||
|
include::input-args.txt[]
|
||||||
|
|
||||||
|
include::hive-args.txt[]
|
||||||
|
|
||||||
|
If Hive arguments are provided to the code generation tool, Sqoop
|
||||||
|
generates a file containing the HQL statements to create a table and
|
||||||
|
load data.
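For instance, a hedged sketch that supplies a Hive argument to +codegen+ so that the HQL file is produced; the +corp+ database and the +emps+ table name mirror examples used elsewhere in this guide and are assumptions here:

----
# --hive-table names the Hive table in the generated HQL; the value is hypothetical.
$ sqoop codegen --connect jdbc:mysql://db.example.com/corp \
    --table employees --hive-table emps
----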
|
||||||
|
|
||||||
|
Example Invocations
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Recreate the record interpretation code for the +employees+ table of a
|
||||||
|
corporate database:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop codegen --connect jdbc:mysql://db.example.com/corp \
|
||||||
|
--table employees
|
||||||
|
----
|
||||||
|
|
||||||
|
|
src/docs/user/common-args.txt (new file, 33 lines)
@ -0,0 +1,33 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
.Common arguments
|
||||||
|
[grid="all"]
|
||||||
|
`-------------------------`------------------------------------------
|
||||||
|
Argument Description
|
||||||
|
---------------------------------------------------------------------
|
||||||
|
+\--connect <jdbc-uri>+ Specify JDBC connect string
|
||||||
|
+\--driver <class-name>+ Manually specify JDBC driver class to use
|
||||||
|
+\--hadoop-home <dir>+ Override $HADOOP_HOME
|
||||||
|
+\--help+ Print usage instructions
|
||||||
|
+-P+ Read password from console
|
||||||
|
+\--password <password>+ Set authentication password
|
||||||
|
+\--username <username>+ Set authentication username
|
||||||
|
+\--verbose+ Print more information while working
|
||||||
|
---------------------------------------------------------------------
|
||||||
|
|
src/docs/user/compatibility.txt (new file, 184 lines)
@ -0,0 +1,184 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
Compatibility Notes
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
Sqoop uses JDBC to connect to databases and adheres to
|
||||||
|
published standards as much as possible. For databases which do not
|
||||||
|
support standards-compliant SQL, Sqoop uses alternate codepaths to
|
||||||
|
provide functionality. In general, Sqoop is believed to be compatible
|
||||||
|
with a large number of databases, but it is tested with only a few.
|
||||||
|
|
||||||
|
Nonetheless, several database-specific decisions were made in the
|
||||||
|
implementation of Sqoop, and some databases offer additional settings
|
||||||
|
which are extensions to the standard.
|
||||||
|
|
||||||
|
This section describes the databases tested with Sqoop, any
|
||||||
|
exceptions in Sqoop's handling of each database relative to the
|
||||||
|
norm, and any database-specific settings available in Sqoop.
|
||||||
|
|
||||||
|
Supported Databases
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
While JDBC is a compatibility layer that allows a program to access
|
||||||
|
many different databases through a common API, slight differences in
|
||||||
|
the SQL language spoken by each database may mean that Sqoop can't use
|
||||||
|
every database out of the box, or that some databases may be used in
|
||||||
|
an inefficient manner.
|
||||||
|
|
||||||
|
When you provide a connect string to Sqoop, it inspects the protocol scheme to
|
||||||
|
determine appropriate vendor-specific logic to use. If Sqoop knows about
|
||||||
|
a given database, it will work automatically. If not, you may need to
|
||||||
|
specify the driver class to load via +\--driver+. This will use a generic
|
||||||
|
code path which will use standard SQL to access the database. Sqoop provides
|
||||||
|
some databases with faster, non-JDBC-based access mechanisms. These can be
|
||||||
|
enabled by specifying the +\--direct+ parameter.
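For a database Sqoop does not recognize, a rough sketch of the generic +\--driver+ code path described above might look like the following; the driver class, connect string, and table name are placeholders only, not a real recommendation:

----
# com.example.jdbc.ExampleDriver is a hypothetical class; use your vendor's driver class.
$ sqoop import --driver com.example.jdbc.ExampleDriver \
    --connect jdbc:exampledb://db.example.com/corp --table employees
----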
|
||||||
|
|
||||||
|
Sqoop includes vendor-specific support for the following databases:
|
||||||
|
|
||||||
|
[grid="all"]
|
||||||
|
`-----------`--------`--------------------`---------------------
|
||||||
|
Database version +\--direct+ support? connect string matches
|
||||||
|
----------------------------------------------------------------
|
||||||
|
HSQLDB 1.8.0+ No +jdbc:hsqldb:*//+
|
||||||
|
MySQL 5.0+ Yes +jdbc:mysql://+
|
||||||
|
Oracle 10.2.0+ No +jdbc:oracle:*//+
|
||||||
|
PostgreSQL 8.3+ Yes (import only) +jdbc:postgresql://+
|
||||||
|
----------------------------------------------------------------
|
||||||
|
|
||||||
|
Sqoop may work with older versions of the databases listed, but we have
|
||||||
|
only tested it with the versions specified above.
|
||||||
|
|
||||||
|
Even if Sqoop supports a database internally, you may still need to
|
||||||
|
install the database vendor's JDBC driver in your +$HADOOP_HOME/lib+
|
||||||
|
path.
|
||||||
|
|
||||||
|
MySQL
|
||||||
|
~~~~~
|
||||||
|
|
||||||
|
JDBC Driver: http://www.mysql.com/downloads/connector/j/[MySQL
|
||||||
|
Connector/J]
|
||||||
|
|
||||||
|
MySQL v5.0 and above offers very thorough coverage by Sqoop. In builds
|
||||||
|
of Sqoop included with Cloudera's Distribution for Hadoop, the
|
||||||
|
Connector/J JDBC driver is included with the installation.
|
||||||
|
|
||||||
|
zeroDateTimeBehavior
|
||||||
|
^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
MySQL allows values of +'0000-00-00'+ for +DATE+ columns, which is a
|
||||||
|
non-standard extension to SQL. When communicated via JDBC, these
|
||||||
|
values are handled in one of three different ways:
|
||||||
|
|
||||||
|
- Convert to +NULL+.
|
||||||
|
- Throw an exception in the client.
|
||||||
|
- Round to the nearest legal date (+'0001-01-01'+).
|
||||||
|
|
||||||
|
You specify the behavior by using the +zeroDateTimeBehavior+
|
||||||
|
property of the connect string. If a +zeroDateTimeBehavior+ property
|
||||||
|
is not specified, Sqoop uses the +convertToNull+ behavior.
|
||||||
|
|
||||||
|
You can override this behavior. For example:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --table foo \
|
||||||
|
--connect jdbc:mysql://db.example.com/someDb?zeroDateTimeBehavior=round
|
||||||
|
----
|
||||||
|
|
||||||
|
+UNSIGNED+ columns
|
||||||
|
^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Columns with type +UNSIGNED+ in MySQL can hold values between 0 and
|
||||||
|
2^32-1 (+4294967295+), but the database will report the data type to Sqoop
|
||||||
|
as +INTEGER+, which can hold values between +-2147483648+ and
|
||||||
|
+\+2147483647+. Sqoop cannot currently import +UNSIGNED+ values above
|
||||||
|
+2147483647+.
|
||||||
|
|
||||||
|
+BLOB+ and +CLOB+ columns
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Sqoop's direct mode does not support imports of +BLOB+, +CLOB+, or
|
||||||
|
+LONGVARBINARY+ columns. Use JDBC-based imports for these
|
||||||
|
columns; do not supply the +\--direct+ argument to the import tool.
|
||||||
|
|
||||||
|
|
||||||
|
Direct-mode Transactions
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
For performance, each writer will commit the current transaction
|
||||||
|
approximately every 32 MB of exported data. You can control this
|
||||||
|
by specifying the following argument _before_ any tool-specific arguments: +-D
|
||||||
|
sqoop.mysql.export.checkpoint.bytes=size+, where _size_ is a value in
|
||||||
|
bytes. Set _size_ to 0 to disable intermediate checkpoints,
|
||||||
|
but individual files being exported will continue to be committed
|
||||||
|
independently of one another.
|
||||||
|
|
||||||
|
IMPORTANT: Note that any arguments to Sqoop that are of the form +-D
|
||||||
|
parameter=value+ are Hadoop _generic arguments_ and must appear before
|
||||||
|
any tool-specific arguments (for example, +\--connect+, +\--table+, etc).
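As a sketch only, a direct-mode MySQL export that raises the checkpoint interval to 64 MB; the connect string, table, and export directory below are placeholders:

----
# 67108864 bytes = 64 MB; the table and directory names are hypothetical.
$ sqoop export -D sqoop.mysql.export.checkpoint.bytes=67108864 \
    --connect jdbc:mysql://db.example.com/corp --table bar \
    --export-dir /results/bar_data --direct
----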
|
||||||
|
|
||||||
|
|
||||||
|
Oracle
|
||||||
|
~~~~~~
|
||||||
|
|
||||||
|
JDBC Driver:
|
||||||
|
http://www.oracle.com/technology/software/tech/java/sqlj_jdbc/htdocs/jdbc_112010.html[Oracle
|
||||||
|
JDBC Thin Driver] - Sqoop is compatible with +ojdbc6.jar+.
|
||||||
|
|
||||||
|
Sqoop has been tested with Oracle 10.2.0 Express Edition. Oracle is
|
||||||
|
notable in its different approach to SQL from the ANSI standard, and
|
||||||
|
its non-standard JDBC driver. Therefore, several features work
|
||||||
|
differently.
|
||||||
|
|
||||||
|
Dates and Times
|
||||||
|
^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Oracle JDBC represents +DATE+ and +TIME+ SQL types as +TIMESTAMP+
|
||||||
|
values. Any +DATE+ columns in an Oracle database will be imported as a
|
||||||
|
+TIMESTAMP+ in Sqoop, and Sqoop-generated code will store these values
|
||||||
|
in +java.sql.Timestamp+ fields.
|
||||||
|
|
||||||
|
When exporting data back to a database, Sqoop parses text fields as
|
||||||
|
+TIMESTAMP+ types (with the form +yyyy-mm-dd HH:MM:SS.ffffffff+) even
|
||||||
|
if you expect these fields to be formatted with the JDBC date escape
|
||||||
|
format of +yyyy-mm-dd+. Dates exported to Oracle should be formatted
|
||||||
|
as full timestamps.
|
||||||
|
|
||||||
|
Oracle also includes the additional date/time types +TIMESTAMP WITH
|
||||||
|
TIMEZONE+ and +TIMESTAMP WITH LOCAL TIMEZONE+. To support these types,
|
||||||
|
the user's session timezone must be specified. By default, Sqoop will
|
||||||
|
specify the timezone +"GMT"+ to Oracle. You can override this setting
|
||||||
|
by specifying a Hadoop property +oracle.sessionTimeZone+ on the
|
||||||
|
command-line when running a Sqoop job. For example:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import -D oracle.sessionTimeZone=America/Los_Angeles \
|
||||||
|
--connect jdbc:oracle:thin:@//db.example.com/foo --table bar
|
||||||
|
----
|
||||||
|
|
||||||
|
Note that Hadoop parameters (+-D ...+) are _generic arguments_ and
|
||||||
|
must appear before the tool-specific arguments (+\--connect+,
|
||||||
|
+\--table+, and so on).
|
||||||
|
|
||||||
|
Legal values for the session timezone string are enumerated at
|
||||||
|
http://download-west.oracle.com/docs/cd/B19306_01/server.102/b14225/applocaledata.htm#i637736[].
|
||||||
|
|
||||||
|
|
||||||
|
include::hive-notes.txt[]
|
||||||
|
|
@ -18,68 +18,70 @@
|
|||||||
|
|
||||||
|
|
||||||
Connecting to a Database Server
|
Connecting to a Database Server
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
Sqoop is designed to import tables from a database into HDFS. As such,
|
Sqoop is designed to import tables from a database into HDFS. To do
|
||||||
it requires a _connect string_ that describes how to connect to the
|
so, you must specify a _connect string_ that describes how to connect to the
|
||||||
database. The _connect string_ looks like a URL, and is communicated to
|
database. The _connect string_ is similar to a URL, and is communicated to
|
||||||
Sqoop with the +--connect+ argument. This describes the server and
|
Sqoop with the +\--connect+ argument. This describes the server and
|
||||||
database to connect to; it may also specify the port. e.g.:
|
database to connect to; it may also specify the port. For example:
|
||||||
|
|
||||||
----
|
----
|
||||||
$ sqoop --connect jdbc:mysql://database.example.com/employees
|
$ sqoop import --connect jdbc:mysql://database.example.com/employees
|
||||||
----
|
----
|
||||||
|
|
||||||
This string will connect to a MySQL database named +employees+ on the
|
This string will connect to a MySQL database named +employees+ on the
|
||||||
host +database.example.com+. It's important that you *do not* use the URL
|
host +database.example.com+. It's important that you *do not* use the URL
|
||||||
+localhost+ if you intend to use Sqoop with a distributed Hadoop
|
+localhost+ if you intend to use Sqoop with a distributed Hadoop
|
||||||
cluster. The connect string you supply will be used on TaskTracker nodes
|
cluster. The connect string you supply will be used on TaskTracker nodes
|
||||||
throughout your MapReduce cluster; if they're told to connect to the
|
throughout your MapReduce cluster; if you specify the
|
||||||
literal name +localhost+, they'll each reach a different
|
literal name +localhost+, each node will connect to a different
|
||||||
database (or more likely, no database at all)! Instead, you should use
|
database (or more likely, no database at all). Instead, you should use
|
||||||
the full hostname or IP address of the database host that can be seen
|
the full hostname or IP address of the database host that can be seen
|
||||||
by all your remote nodes.
|
by all your remote nodes.
|
||||||
|
|
||||||
You may need to authenticate against the database before you can
|
You might need to authenticate against the database before you can
|
||||||
access it. The +--username+ and +--password+ or +-P+ parameters can
|
access it. You can use the +\--username+ and +\--password+ or +-P+ parameters
|
||||||
be used to supply a username and a password to the database. e.g.:
|
to supply a username and a password to the database. For example:
|
||||||
|
|
||||||
----
|
----
|
||||||
$ sqoop --connect jdbc:mysql://database.example.com/employees \
|
$ sqoop import --connect jdbc:mysql://database.example.com/employees \
|
||||||
--username aaron --password 12345
|
--username aaron --password 12345
|
||||||
----
|
----
|
||||||
|
|
||||||
.Password security
|
.Password security
|
||||||
WARNING: The +--password+ parameter is insecure, as other users may
|
WARNING: The +\--password+ parameter is insecure, as other users may
|
||||||
be able to read your password from the command-line arguments via
|
be able to read your password from the command-line arguments via
|
||||||
the output of programs such as `ps`. The *+-P+* argument will read
|
the output of programs such as `ps`. The *+-P+* argument will read
|
||||||
a password from a console prompt, and is the preferred method of
|
a password from a console prompt, and is the preferred method of
|
||||||
entering credentials. Credentials may still be transferred between
|
entering credentials. Credentials may still be transferred between
|
||||||
nodes of the MapReduce cluster using insecure means.
|
nodes of the MapReduce cluster using insecure means.
|
||||||
|
|
||||||
Sqoop automatically supports several databases, including MySQL. Connect strings beginning
|
Sqoop automatically supports several databases, including MySQL.
|
||||||
with +jdbc:mysql://+ are handled automatically Sqoop, though you may need
|
Connect strings beginning with +jdbc:mysql://+ are handled
|
||||||
to install the driver yourself. (A full list of databases with
|
automatically in Sqoop, though you may need to install the driver
|
||||||
built-in support is provided in the "Supported Databases" section, below.)
|
yourself. (A full list of databases with built-in support is provided
|
||||||
|
in the "Supported Databases" section.)
|
||||||
|
|
||||||
You can use Sqoop with any other
|
You can use Sqoop with any other
|
||||||
JDBC-compliant database as well. First, download the appropriate JDBC
|
JDBC-compliant database. First, download the appropriate JDBC
|
||||||
driver for the database you want to import from, and install the .jar
|
driver for the type of database you want to import, and install the .jar
|
||||||
file in the +/usr/hadoop/lib+ directory on all machines in your Hadoop
|
file in the +/usr/hadoop/lib+ directory on all machines in your Hadoop
|
||||||
cluster, or some other directory which is in the classpath
|
cluster, or some other directory which is in the classpath
|
||||||
on all nodes. Each driver jar also has a specific driver class which defines
|
on all nodes. Each driver +.jar+ file also has a specific driver class which defines
|
||||||
the entry-point to the driver. For example, MySQL's Connector/J library has
|
the entry-point to the driver. For example, MySQL's Connector/J library has
|
||||||
a driver class of +com.mysql.jdbc.Driver+. Refer to your database
|
a driver class of +com.mysql.jdbc.Driver+. Refer to your database
|
||||||
vendor-specific documentation to determine the main driver class.
|
vendor-specific documentation to determine the main driver class.
|
||||||
This class must be provided as an argument to Sqoop with +--driver+.
|
This class must be provided as an argument to Sqoop with +\--driver+.
|
||||||
|
|
||||||
For example, to connect to a postgres database, first download the driver from
|
For example, to connect to a SQLServer database, first download the driver from
|
||||||
link:http://jdbc.postgresql.org[http://jdbc.postgresql.org] and
|
microsoft.com and install it in your Hadoop lib path.
|
||||||
install it in your Hadoop lib path.
|
|
||||||
Then run Sqoop with something like:
|
Then run Sqoop. For example:
|
||||||
|
|
||||||
----
|
----
|
||||||
$ sqoop --connect jdbc:postgresql://postgres-server.example.com/employees \
|
$ sqoop import --driver com.microsoft.jdbc.sqlserver.SQLServerDriver \
|
||||||
--driver org.postgresql.Driver
|
--connect <connect-string> ...
|
||||||
----
|
----
|
||||||
|
|
||||||
|
|
@ -29,14 +29,14 @@ include::input-formatting-args.txt[]
|
|||||||
|
|
||||||
If you have already imported data into HDFS in a text-based
|
If you have already imported data into HDFS in a text-based
|
||||||
representation and want to change the delimiters being used, you
|
representation and want to change the delimiters being used, you
|
||||||
should regenerate the class via `sqoop --generate-only`, specifying
|
should regenerate the class via `sqoop \--generate-only`, specifying
|
||||||
the new delimiters with +--fields-terminated-by+, etc., and the old
|
the new delimiters with +\--fields-terminated-by+, etc., and the old
|
||||||
delimiters with +--input-fields-terminated-by+, etc. Then run a
|
delimiters with +\--input-fields-terminated-by+, etc. Then run a
|
||||||
MapReduce job where your mapper creates an instance of your record
|
MapReduce job where your mapper creates an instance of your record
|
||||||
class, uses its +parse()+ method to read the fields using the old
|
class, uses its +parse()+ method to read the fields using the old
|
||||||
delimiters, and emits a new +Text+ output value via the record's
|
delimiters, and emits a new +Text+ output value via the record's
|
||||||
+toString()+ method, which will use the new delimiters. You'll then
|
+toString()+ method, which will use the new delimiters. You'll then
|
||||||
want to regenerate the class another time without the
|
want to regenerate the class another time without the
|
||||||
+--input-fields-terminated-by+ specified so that the new delimiters
|
+\--input-fields-terminated-by+ specified so that the new delimiters
|
||||||
are used for both input and output.
|
are used for both input and output.
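A rough sketch of the regeneration step described above, keeping the older single-command syntax used in this passage; the table name and delimiters are only placeholders:

----
# Assumes the table was originally imported with comma-separated fields.
$ sqoop --generate-only --connect jdbc:mysql://db.example.com/corp \
    --table employees --fields-terminated-by '\t' \
    --input-fields-terminated-by ','
----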
|
||||||
|
|
src/docs/user/create-hive-table.txt (new file, 82 lines)
@ -0,0 +1,82 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
+sqoop-create-hive-table+
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
~~~~~~~
|
||||||
|
|
||||||
|
The +create-hive-table+ tool populates a Hive metastore with a
|
||||||
|
definition for a table based on a database table previously imported
|
||||||
|
to HDFS, or one planned to be imported. This effectively performs the
|
||||||
|
"+\--hive-import+" step of +sqoop-import+ without running the
|
||||||
|
preceding import.
|
||||||
|
|
||||||
|
If data was already loaded to HDFS, you can use this tool to finish
|
||||||
|
the pipeline of importing the data to Hive. You can also create Hive tables
|
||||||
|
with this tool; data can then be imported and populated into
|
||||||
|
the target after a preprocessing step run by the user.
|
||||||
|
|
||||||
|
Syntax
|
||||||
|
~~~~~~
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop create-hive-table (generic-args) (create-hive-table-args)
|
||||||
|
$ sqoop-create-hive-table (generic-args) (create-hive-table-args)
|
||||||
|
----
|
||||||
|
|
||||||
|
Although the Hadoop generic arguments must precede any create-hive-table
|
||||||
|
arguments, the create-hive-table arguments can be entered in any order
|
||||||
|
with respect to one another.
|
||||||
|
|
||||||
|
|
||||||
|
include::common-args.txt[]
|
||||||
|
|
||||||
|
.Hive arguments:
|
||||||
|
[grid="all"]
|
||||||
|
`-----------------------------`-------------------------------------------
|
||||||
|
Argument Description
|
||||||
|
--------------------------------------------------------------------------
|
||||||
|
+\--hive-home <dir>+ Override +$HIVE_HOME+
|
||||||
|
+\--hive-overwrite+ Overwrite existing data in the Hive table.
|
||||||
|
+\--hive-table <table-name>+ Sets the table name to use when importing \
|
||||||
|
to Hive.
|
||||||
|
+\--table+ The database table to read the \
|
||||||
|
definition from.
|
||||||
|
--------------------------------------------------------------------------
|
||||||
|
|
||||||
|
include::output-args.txt[]
|
||||||
|
|
||||||
|
Do not use enclosed-by or escaped-by delimiters with output formatting
|
||||||
|
arguments used to import to Hive. Hive cannot currently parse them.
|
||||||
|
|
||||||
|
Example Invocations
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Define in Hive a table named +emps+ with a definition based on a
|
||||||
|
database table named +employees+:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop create-hive-table --connect jdbc:mysql://db.example.com/corp \
|
||||||
|
--table employees --hive-table emps
|
||||||
|
----
|
||||||
|
|
||||||
|
|
src/docs/user/eval.txt (new file, 65 lines)
@ -0,0 +1,65 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
+sqoop-eval+
|
||||||
|
------------
|
||||||
|
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
~~~~~~~
|
||||||
|
|
||||||
|
The +eval+ tool allows users to quickly run simple SQL queries against
|
||||||
|
a database; results are printed to the console. This allows users to
|
||||||
|
preview their import queries to ensure they import the data they
|
||||||
|
expect.
|
||||||
|
|
||||||
|
Syntax
|
||||||
|
~~~~~~
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop eval (generic-args) (eval-args)
|
||||||
|
$ sqoop-eval (generic-args) (eval-args)
|
||||||
|
----
|
||||||
|
|
||||||
|
Although the Hadoop generic arguments must precede any eval arguments,
|
||||||
|
the eval arguments can be entered in any order with respect to one
|
||||||
|
another.
|
||||||
|
|
||||||
|
|
||||||
|
include::common-args.txt[]
|
||||||
|
|
||||||
|
.SQL evaluation arguments:
|
||||||
|
[grid="all"]
|
||||||
|
`-----------------------------`-------------------------------------------
|
||||||
|
Argument Description
|
||||||
|
--------------------------------------------------------------------------
|
||||||
|
+-e,\--query <statement>+ Execute '+statement+' in SQL.
|
||||||
|
--------------------------------------------------------------------------
|
||||||
|
|
||||||
|
Example Invocations
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Select ten records from the +employees+ table:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop eval --connect jdbc:mysql://db.example.com/corp \
|
||||||
|
--query "SELECT * FROM employees LIMIT 10"
|
||||||
|
----
|
||||||
|
|
||||||
|
|
src/docs/user/export.txt (new file, 153 lines)
@ -0,0 +1,153 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
+sqoop-export+
|
||||||
|
--------------
|
||||||
|
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
~~~~~~~
|
||||||
|
|
||||||
|
The +export+ tool exports a set of files from HDFS back to an RDBMS.
|
||||||
|
The target table must already exist in the database. The input files
|
||||||
|
are read and parsed into a set of records according to the
|
||||||
|
user-specified delimiters. These are then transformed into a set of
|
||||||
|
+INSERT+ statements that inject the records into the database.
|
||||||
|
|
||||||
|
Syntax
|
||||||
|
~~~~~~
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop export (generic-args) (export-args)
|
||||||
|
$ sqoop-export (generic-args) (export-args)
|
||||||
|
----
|
||||||
|
|
||||||
|
Although the Hadoop generic arguments must precede any export arguments,
|
||||||
|
the export arguments can be entered in any order with respect to one
|
||||||
|
another.
|
||||||
|
|
||||||
|
|
||||||
|
include::common-args.txt[]
|
||||||
|
|
||||||
|
.Export control arguments:
|
||||||
|
[grid="all"]
|
||||||
|
`-------------------------`------------------------------------------
|
||||||
|
Argument Description
|
||||||
|
---------------------------------------------------------------------
|
||||||
|
+\--direct+ Use direct export fast path
|
||||||
|
+\--export-dir <dir>+ HDFS source path for the export
|
||||||
|
+-m,\--num-mappers <n>+ Use 'n' map tasks to export in parallel
|
||||||
|
+\--table <table-name>+ Table to populate
|
||||||
|
---------------------------------------------------------------------
|
||||||
|
|
||||||
|
The +\--table+ and +\--export-dir+ arguments are required. These
|
||||||
|
specify the table to populate in the database, and the
|
||||||
|
directory in HDFS that contains the source data.
|
||||||
|
|
||||||
|
You can control the number of mappers independently from the number of
|
||||||
|
files present in the directory. Export performance depends on the
|
||||||
|
degree of parallelism. By default, Sqoop will use four tasks in
|
||||||
|
parallel for the export process. This may not be optimal; you will
|
||||||
|
need to experiment with your own particular setup. Additional tasks
|
||||||
|
may offer better concurrency, but if the database is already
|
||||||
|
bottlenecked on updating indices, invoking triggers, and so on, then
|
||||||
|
additional load may decrease performance. The +\--num-mappers+ or +-m+
|
||||||
|
arguments control the number of map tasks, which is the degree of
|
||||||
|
parallelism used.
|
||||||
|
|
||||||
|
MySQL provides a direct mode for exports as well, using the
|
||||||
|
+mysqlimport+ tool. When exporting to MySQL, use the +\--direct+ argument
|
||||||
|
to specify this codepath. This may be
|
||||||
|
higher-performance than the standard JDBC codepath.
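Putting the two preceding points together, a sketch of an export that uses eight map tasks and MySQL's direct path; the database, table, and directory names are reused from the example invocation later in this section, and the mapper count is only illustrative:

----
$ sqoop export --connect jdbc:mysql://db.example.com/foo --table bar \
    --export-dir /results/bar_data -m 8 --direct
----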
|
||||||
|
|
||||||
|
include::input-args.txt[]
|
||||||
|
|
||||||
|
include::output-args.txt[]
|
||||||
|
|
||||||
|
Sqoop automatically generates code to parse and interpret records of the
|
||||||
|
files containing the data to be exported back to the database. If
|
||||||
|
these files were created with non-default delimiters (comma-separated
|
||||||
|
fields with newline-separated records), you should specify
|
||||||
|
the same delimiters again so that Sqoop can parse your files.
|
||||||
|
|
||||||
|
If you specify incorrect delimiters, Sqoop will fail to find enough
|
||||||
|
columns per line. This will cause export map tasks to fail by throwing
|
||||||
|
+ParseExceptions+.
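For instance, if the source files were written with tab-separated fields, a hedged sketch of the corresponding export (assuming the input formatting arguments described by the included argument tables) would be:

----
$ sqoop export --connect jdbc:mysql://db.example.com/foo --table bar \
    --export-dir /results/bar_data --input-fields-terminated-by '\t'
----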
|
||||||
|
|
||||||
|
include::codegen-args.txt[]
|
||||||
|
|
||||||
|
If the records to be exported were generated as the result of a
|
||||||
|
previous import, then the original generated class can be used to read
|
||||||
|
the data back. Specifying +\--jar-file+ and +\--class-name+ obviates
|
||||||
|
the need to specify delimiters in this case.
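A sketch of this reuse; the jar path and class name below are hypothetical, so substitute the values reported by the original import's code generation:

----
$ sqoop export --connect jdbc:mysql://db.example.com/foo --table bar \
    --export-dir /results/bar_data \
    --jar-file /tmp/sqoop-generated/bar.jar --class-name bar
----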
|
||||||
|
|
||||||
|
Exports and Transactions
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Exports are performed by multiple writers in parallel. Each writer
|
||||||
|
uses a separate connection to the database; these have separate
|
||||||
|
transactions from one another. Sqoop uses the multi-row +INSERT+
|
||||||
|
syntax to insert up to 100 records per statement. Every 100
|
||||||
|
statements, the current transaction within a writer task is committed,
|
||||||
|
causing a commit every 10,000 rows. This ensures that transaction
|
||||||
|
buffers do not grow without bound, and cause out-of-memory conditions.
|
||||||
|
Therefore, an export is not an atomic process. Partial results from
|
||||||
|
the export will become visible before the export is complete.
|
||||||
|
|
||||||
|
Failed Exports
|
||||||
|
~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Exports may fail for a number of reasons:
|
||||||
|
|
||||||
|
- Loss of connectivity from the Hadoop cluster to the database (either
|
||||||
|
due to hardware fault, or server software crashes)
|
||||||
|
- Attempting to +INSERT+ a row which violates a consistency constraint
|
||||||
|
(for example, inserting a duplicate primary key value)
|
||||||
|
- Attempting to parse an incomplete or malformed record from the HDFS
|
||||||
|
source data
|
||||||
|
- Attempting to parse records using incorrect delimiters
|
||||||
|
- Capacity issues (such as insufficient RAM or disk space)
|
||||||
|
|
||||||
|
If an export map task fails due to these or other reasons, it will
|
||||||
|
cause the export job to fail. The results of a failed export are
|
||||||
|
undefined. Each export map task operates in a separate transaction.
|
||||||
|
Furthermore, individual map tasks +commit+ their current transaction
|
||||||
|
periodically. If a task fails, the current transaction will be rolled
|
||||||
|
back. Any previously-committed transactions will remain durable in the
|
||||||
|
database, leading to a partially-complete export.
|
||||||
|
|
||||||
|
Example Invocations
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
A basic export to populate a table named +bar+:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop export --connect jdbc:mysql://db.example.com/foo --table bar \
|
||||||
|
--export-dir /results/bar_data
|
||||||
|
----
|
||||||
|
|
||||||
|
This example takes the files in +/results/bar_data+ and injects their
|
||||||
|
contents into the +bar+ table in the +foo+ database on +db.example.com+.
|
||||||
|
The target table must already exist in the database. Sqoop performs
|
||||||
|
a set of +INSERT INTO+ operations, without regard for existing content. If
|
||||||
|
Sqoop attempts to insert rows which violate constraints in the database
|
||||||
|
(for example, a particular primary key value already exists), then the export
|
||||||
|
fails.
|
||||||
|
|
||||||
|
|
src/docs/user/help.txt (new file, 82 lines)
@ -0,0 +1,82 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
+sqoop-help+
|
||||||
|
------------
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
~~~~~~~
|
||||||
|
|
||||||
|
List tools available in Sqoop and explain their usage.
|
||||||
|
|
||||||
|
Syntax
|
||||||
|
~~~~~~
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop help [tool-name]
|
||||||
|
$ sqoop-help [tool-name]
|
||||||
|
----
|
||||||
|
|
||||||
|
If no tool name is provided (for example, the user runs +sqoop help+), then
|
||||||
|
the available tools are listed. With a tool name, the usage
|
||||||
|
instructions for that specific tool are presented on the console.
|
||||||
|
|
||||||
|
Example Invocations
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
List available tools:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop help
|
||||||
|
usage: sqoop COMMAND [ARGS]
|
||||||
|
|
||||||
|
Available commands:
|
||||||
|
codegen Generate code to interact with database records
|
||||||
|
create-hive-table Import a table definition into Hive
|
||||||
|
eval Evaluate a SQL statement and display the results
|
||||||
|
export Export an HDFS directory to a database table
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
See 'sqoop help COMMAND' for information on a specific command.
|
||||||
|
----
|
||||||
|
|
||||||
|
Display usage instructions for the +import+ tool:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ bin/sqoop help import
|
||||||
|
usage: sqoop import [GENERIC-ARGS] [TOOL-ARGS]
|
||||||
|
|
||||||
|
Common arguments:
|
||||||
|
--connect <jdbc-uri> Specify JDBC connect string
|
||||||
|
--driver <class-name> Manually specify JDBC driver class to use
|
||||||
|
--hadoop-home <dir> Override $HADOOP_HOME
|
||||||
|
--help Print usage instructions
|
||||||
|
-P Read password from console
|
||||||
|
--password <password> Set authentication password
|
||||||
|
--username <username> Set authentication username
|
||||||
|
--verbose Print more information while working
|
||||||
|
|
||||||
|
Import control arguments:
|
||||||
|
--as-sequencefile Imports data to SequenceFiles
|
||||||
|
--as-textfile Imports data as plain text (default)
|
||||||
|
...
|
||||||
|
----
|
||||||
|
|
||||||
|
|
src/docs/user/hive-args.txt (new file, 32 lines)
@ -0,0 +1,32 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
.Hive arguments:
|
||||||
|
[grid="all"]
|
||||||
|
`-----------------------------`-------------------------------------------
|
||||||
|
Argument Description
|
||||||
|
--------------------------------------------------------------------------
|
||||||
|
+\--hive-home <dir>+ Override +$HIVE_HOME+
|
||||||
|
+\--hive-import+ Import tables into Hive (Uses Hive's \
|
||||||
|
default delimiters if none are set.)
|
||||||
|
+\--hive-overwrite+ Overwrite existing data in the Hive table.
|
||||||
|
+\--hive-table <table-name>+ Sets the table name to use when importing\
|
||||||
|
to Hive.
|
||||||
|
--------------------------------------------------------------------------
|
||||||
|
|
src/docs/user/hive-notes.txt (new file, 30 lines)
@ -0,0 +1,30 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Schema Definition in Hive
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Hive users will note that there is not a one-to-one mapping between
|
||||||
|
SQL types and Hive types. In general, SQL types that do not have a
|
||||||
|
direct mapping (for example, +DATE+, +TIME+, and +TIMESTAMP+) will be coerced to
|
||||||
|
+STRING+ in Hive. The +NUMERIC+ and +DECIMAL+ SQL types will be coerced to
|
||||||
|
+DOUBLE+. In these cases, Sqoop will emit a warning in its log messages
|
||||||
|
informing you of the loss of precision.
|
||||||
|
|
src/docs/user/hive.txt (new file, 59 lines)
@ -0,0 +1,59 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
Importing Data Into Hive
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The main function of Sqoop's import tool is to upload your data into files
|
||||||
|
in HDFS. If you have a Hive metastore associated with your HDFS
|
||||||
|
cluster, Sqoop can also import the data into Hive by generating and
|
||||||
|
executing a +CREATE TABLE+ statement to define the data's layout in
|
||||||
|
Hive. Importing data into Hive is as simple as adding the
|
||||||
|
*+\--hive-import+* option to your Sqoop command line.
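For example, reusing the hypothetical +corp+ database that appears elsewhere in this guide:

----
$ sqoop import --connect jdbc:mysql://db.example.com/corp \
    --table employees --hive-import
----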
|
||||||
|
|
||||||
|
If the Hive table already exists, you can specify the
|
||||||
|
*+\--hive-overwrite+* option to indicate that the existing table in Hive must
|
||||||
|
be replaced. After your data is imported into HDFS or this step is
|
||||||
|
omitted, Sqoop will generate a Hive script containing a +CREATE TABLE+
|
||||||
|
operation defining your columns using Hive's types, and a +LOAD DATA INPATH+
|
||||||
|
statement to move the data files into Hive's warehouse directory.
|
||||||
|
|
||||||
|
The script will be executed by calling
|
||||||
|
the installed copy of hive on the machine where Sqoop is run. If you have
|
||||||
|
multiple Hive installations, or +hive+ is not in your +$PATH+, use the
|
||||||
|
*+\--hive-home+* option to identify the Hive installation directory.
|
||||||
|
Sqoop will use +$HIVE_HOME/bin/hive+ from here.
|
||||||
|
|
||||||
|
NOTE: This function is incompatible with +\--as-sequencefile+.
|
||||||
|
|
||||||
|
Hive's text parser does not support escaping or enclosing
|
||||||
|
characters. Sqoop will print a warning if you use +\--escaped-by+,
|
||||||
|
+\--enclosed-by+, or +\--optionally-enclosed-by+ because Hive does not know
|
||||||
|
how to parse these. It will pass the field and record delimiters through
|
||||||
|
to Hive. If you do not set any delimiters and do use +\--hive-import+,
|
||||||
|
the field delimiter will be set to +^A+ and the record delimiter will
|
||||||
|
be set to +\n+ to be consistent with Hive's defaults. It is important when
|
||||||
|
importing data to Hive to choose unambiguous field and record delimiters
|
||||||
|
due to the lack of escape and enclosing characters.
|
||||||
|
|
||||||
|
The table name used in Hive is, by default, the same as that of the
|
||||||
|
source table. You can control the output table name with the +\--hive-table+
|
||||||
|
option.
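A brief sketch, importing the hypothetical +employees+ table into a Hive table named +emps+ (the same name used in the +create-hive-table+ example):

----
$ sqoop import --connect jdbc:mysql://db.example.com/corp \
    --table employees --hive-import --hive-table emps
----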
|
||||||
|
|
||||||
|
|
src/docs/user/import-all-tables.txt (new file, 112 lines)
@ -0,0 +1,112 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
+sqoop-import-all-tables+
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
~~~~~~~
|
||||||
|
|
||||||
|
The +import-all-tables+ tool imports a set of tables from an RDBMS to HDFS.
|
||||||
|
Data from each table is stored in a separate directory in HDFS.
|
||||||
|
|
||||||
|
For the +import-all-tables+ tool to be useful, the following conditions
|
||||||
|
must be met:
|
||||||
|
|
||||||
|
- Each table must have a single-column primary key.
|
||||||
|
- You must intend to import all columns of each table.
|
||||||
|
- You must not intend to use non-default splitting column, nor impose
|
||||||
|
any conditions via a +WHERE+ clause.
|
||||||
|
|
||||||
|
Syntax
|
||||||
|
~~~~~~
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import-all-tables (generic-args) (import-args)
|
||||||
|
$ sqoop-import-all-tables (generic-args) (import-args)
|
||||||
|
----
|
||||||
|
|
||||||
|
Although the Hadoop generic arguments must precede any import arguments,
|
||||||
|
the import arguments can be entered in any order with respect to one
|
||||||
|
another.
|
||||||
|
|
||||||
|
include::common-args.txt[]
|
||||||
|
|
||||||
|
.Import control arguments:
|
||||||
|
[grid="all"]
|
||||||
|
`----------------------------`---------------------------------------
|
||||||
|
Argument Description
|
||||||
|
---------------------------------------------------------------------
|
||||||
|
+\--as-sequencefile+ Imports data to SequenceFiles
|
||||||
|
+\--as-textfile+ Imports data as plain text (default)
|
||||||
|
+\--direct+ Use direct import fast path
|
||||||
|
+\--direct-split-size <n>+ Split the input stream every 'n' bytes when\
|
||||||
|
importing in direct mode
|
||||||
|
+\--inline-lob-limit <n>+ Set the maximum size for an inline LOB
|
||||||
|
+-m,\--num-mappers <n>+ Use 'n' map tasks to import in parallel
|
||||||
|
+\--warehouse-dir <dir>+ HDFS parent for table destination
|
||||||
|
+-z,\--compress+ Enable compression
|
||||||
|
---------------------------------------------------------------------
|
||||||
|
|
||||||
|
These arguments behave in the same manner as they do when used for the
|
||||||
|
+sqoop-import+ tool, but the +\--table+, +\--split-by+, +\--columns+,
|
||||||
|
and +\--where+ arguments are invalid for +sqoop-import-all-tables+.
|
||||||
|
|
||||||
|
include::output-args.txt[]
|
||||||
|
|
||||||
|
include::input-args.txt[]
|
||||||
|
|
||||||
|
include::hive-args.txt[]
|
||||||
|
|
||||||
|
.Code generation arguments:
|
||||||
|
[grid="all"]
|
||||||
|
`------------------------`-----------------------------------------------
|
||||||
|
Argument Description
|
||||||
|
-------------------------------------------------------------------------
|
||||||
|
+\--bindir <dir>+ Output directory for compiled objects
|
||||||
|
+\--jar-file <file>+ Disable code generation; use specified jar
|
||||||
|
+\--outdir <dir>+ Output directory for generated code
|
||||||
|
+\--package-name <name>+ Put auto-generated classes in this package
|
||||||
|
-------------------------------------------------------------------------
|
||||||
|
|
||||||
|
The +import-all-tables+ tool does not support the +\--class-name+ argument.
|
||||||
|
You may, however, specify a package with +\--package-name+ in which all
|
||||||
|
generated classes will be placed.
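
As a sketch (the database and package names are illustrative), the
following imports every table while placing all generated classes under
a single package:

----
$ sqoop import-all-tables --connect jdbc:mysql://db.foo.com/corp \
    --package-name com.foocorp.model
----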
|
||||||
|
|
||||||
|
Example Invocations
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Import all tables from the +corp+ database:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import-all-tables --connect jdbc:mysql://db.foo.com/corp
|
||||||
|
----
|
||||||
|
|
||||||
|
Verifying that it worked:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ hadoop fs -ls
|
||||||
|
Found 4 items
|
||||||
|
drwxr-xr-x - someuser somegrp 0 2010-04-27 17:15 /user/someuser/EMPLOYEES
|
||||||
|
drwxr-xr-x - someuser somegrp 0 2010-04-27 17:15 /user/someuser/PAYCHECKS
|
||||||
|
drwxr-xr-x - someuser somegrp 0 2010-04-27 17:15 /user/someuser/DEPARTMENTS
|
||||||
|
drwxr-xr-x - someuser somegrp 0 2010-04-27 17:15 /user/someuser/OFFICE_SUPPLIES
|
||||||
|
----
|
||||||
|
|
||||||
|
|
500
src/docs/user/import.txt
Normal file
@ -0,0 +1,500 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
+sqoop-import+
|
||||||
|
--------------
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
~~~~~~~
|
||||||
|
|
||||||
|
The +import+ tool imports an individual table from an RDBMS to HDFS.
|
||||||
|
Each row from a table is represented as a separate record in HDFS.
|
||||||
|
Records can be stored as text files (one record per line), or in
|
||||||
|
binary representation in SequenceFiles.
|
||||||
|
|
||||||
|
Syntax
|
||||||
|
~~~~~~
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import (generic-args) (import-args)
|
||||||
|
$ sqoop-import (generic-args) (import-args)
|
||||||
|
----
|
||||||
|
|
||||||
|
While the Hadoop generic arguments must precede any import arguments,
|
||||||
|
you can type the import arguments in any order with respect to one
|
||||||
|
another.
|
||||||
|
|
||||||
|
NOTE: In this document, arguments are grouped into collections
|
||||||
|
organized by function. Some collections are present in several tools
|
||||||
|
(for example, the "common" arguments). An extended description of their
|
||||||
|
functionality is given only on the first presentation in this
|
||||||
|
document.
|
||||||
|
|
||||||
|
include::common-args.txt[]
|
||||||
|
|
||||||
|
include::connecting.txt[]
|
||||||
|
|
||||||
|
.Import control arguments:
|
||||||
|
[grid="all"]
|
||||||
|
`-----------------------------`--------------------------------------
|
||||||
|
Argument Description
|
||||||
|
---------------------------------------------------------------------
|
||||||
|
+\--as-sequencefile+ Imports data to SequenceFiles
|
||||||
|
+\--as-textfile+ Imports data as plain text (default)
|
||||||
|
+\--columns <col,col,col...>+ Columns to import from table
|
||||||
|
+\--direct+ Use direct import fast path
|
||||||
|
+\--direct-split-size <n>+ Split the input stream every 'n' bytes\
|
||||||
|
when importing in direct mode
|
||||||
|
+\--inline-lob-limit <n>+ Set the maximum size for an inline LOB
|
||||||
|
+-m,\--num-mappers <n>+ Use 'n' map tasks to import in parallel
|
||||||
|
+\--split-by <column-name>+ Column of the table used to split work\
|
||||||
|
units
|
||||||
|
+\--table <table-name>+ Table to read
|
||||||
|
+\--warehouse-dir <dir>+ HDFS parent for table destination
|
||||||
|
+\--where <where clause>+ WHERE clause to use during import
|
||||||
|
+-z,\--compress+ Enable compression
|
||||||
|
---------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Selecting the Data to Import
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Sqoop currently imports data in a table-centric fashion. Use the
|
||||||
|
+\--table+ argument to select the table to import. For example, +\--table
|
||||||
|
employees+. This argument can also identify a +VIEW+ or other table-like
|
||||||
|
entity in a database.
|
||||||
|
|
||||||
|
By default, all columns within a table are selected for import.
|
||||||
|
Imported data is written to HDFS in its "natural order;" that is, a
|
||||||
|
table containing columns A, B, and C results in an import of data such
|
||||||
|
as:
|
||||||
|
|
||||||
|
----
|
||||||
|
A1,B1,C1
|
||||||
|
A2,B2,C2
|
||||||
|
...
|
||||||
|
----
|
||||||
|
|
||||||
|
You can select a subset of columns and control their ordering by using
|
||||||
|
the +\--columns+ argument. This should include a comma-delimited list
|
||||||
|
of columns to import. For example: +\--columns "name,employee_id,jobtitle"+.
|
||||||
|
|
||||||
|
You can control which rows are imported by adding a SQL +WHERE+ clause
|
||||||
|
to the import statement. By default, Sqoop generates statements of the
|
||||||
|
form +SELECT <column list> FROM <table name>+. You can append a
|
||||||
|
+WHERE+ clause to this with the +\--where+ argument. For example: +\--where
|
||||||
|
"id > 400"+. Only rows where the +id+ column has a value greater than
|
||||||
|
400 will be imported.
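
Combining the two, a sketch such as the following (the column names and
+WHERE+ condition are illustrative) imports only selected columns of the
matching rows:

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    --columns "employee_id,first_name,last_name" \
    --where "start_date > '2010-01-01'"
----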
|
||||||
|
|
||||||
|
Controlling Parallelism
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Sqoop imports data in parallel from most database sources. You can
|
||||||
|
specify the number
|
||||||
|
of map tasks (parallel processes) to use to perform the import by
|
||||||
|
using the +-m+ or +\--num-mappers+ argument. Each of these arguments
|
||||||
|
takes an integer value which corresponds to the degree of parallelism
|
||||||
|
to employ. By default, four tasks are used. Some databases may see
|
||||||
|
improved performance by increasing this value to 8 or 16. Do not
|
||||||
|
increase the degree of parallelism greater than that available within
|
||||||
|
your MapReduce cluster; tasks will run serially and will likely
|
||||||
|
increase the amount of time required to perform the import. Likewise,
|
||||||
|
do not increase the degree of parallelism higher than that which your
|
||||||
|
database can reasonably support. Connecting 100 concurrent clients to
|
||||||
|
your database may increase the load on the database server to a point
|
||||||
|
where performance suffers as a result.
|
||||||
|
|
||||||
|
When performing parallel imports, Sqoop needs a criterion by which it
|
||||||
|
can split the workload. Sqoop uses a _splitting column_ to split the
|
||||||
|
workload. By default, Sqoop will identify the primary key column (if
|
||||||
|
present) in a table and use it as the splitting column. The low and
|
||||||
|
high values for the splitting column are retrieved from the database,
|
||||||
|
and the map tasks operate on evenly-sized components of the total
|
||||||
|
range. For example, if you had a table with a primary key column of
|
||||||
|
+id+ whose minimum value was 0 and maximum value was 1000, and Sqoop
|
||||||
|
was directed to use 4 tasks, Sqoop would run four processes which each
|
||||||
|
execute SQL statements of the form +SELECT * FROM sometable WHERE id
|
||||||
|
>= lo AND id < hi+, with +(lo, hi)+ set to (0, 250), (250, 500),
|
||||||
|
(500, 750), and (750, 1001) in the different tasks.
|
||||||
|
|
||||||
|
If the actual values for the primary key are not uniformly distributed
|
||||||
|
across its range, then this can result in unbalanced tasks. You should
|
||||||
|
explicitly choose a different column with the +\--split-by+ argument.
|
||||||
|
For example, +\--split-by employee_id+. Sqoop cannot currently split on
|
||||||
|
multi-column indices. If your table has no index column, or has a
|
||||||
|
multi-column key, then you must also manually choose a splitting
|
||||||
|
column.
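
For instance, the following sketch (the table and column names are
illustrative) runs eight parallel tasks, splitting the work on a column
other than the primary key:

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    --split-by dept_id -m 8
----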
|
||||||
|
|
||||||
|
Controlling the Import Process
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
By default, the import process will use JDBC which provides a
|
||||||
|
reasonable cross-vendor import channel. Some databases can perform
|
||||||
|
imports in a more high-performance fashion by using database-specific
|
||||||
|
data movement tools. For example, MySQL provides the +mysqldump+ tool
|
||||||
|
which can export data from MySQL to other systems very quickly. By
|
||||||
|
supplying the +\--direct+ argument, you are specifying that Sqoop
|
||||||
|
should attempt the direct import channel. This channel may be
|
||||||
|
higher performance than using JDBC. Currently, direct mode does not
|
||||||
|
support imports of large object columns.
|
||||||
|
|
||||||
|
When importing from PostgreSQL in conjunction with direct mode, you
|
||||||
|
can split the import into separate files after
|
||||||
|
individual files reach a certain size. This size limit is controlled
|
||||||
|
with the +\--direct-split-size+ argument.
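
As a sketch (the connect string is a placeholder), the following runs a
direct-mode PostgreSQL import and starts a new file roughly every 64 MB:

----
$ sqoop import --connect jdbc:postgresql://db.foo.com/corp --table EMPLOYEES \
    --direct --direct-split-size 67108864
----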
|
||||||
|
|
||||||
|
By default, Sqoop will import a table named +foo+ to a directory named
|
||||||
|
+foo+ inside your home directory in HDFS. For example, if your
|
||||||
|
username is +someuser+, then the import tool will write to
|
||||||
|
+/user/someuser/foo/(files)+. You can adjust the parent directory of
|
||||||
|
the import with the +\--warehouse-dir+ argument. For example:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --connect <connect-str> --table foo --warehouse-dir /shared \
|
||||||
|
...
|
||||||
|
----
|
||||||
|
|
||||||
|
This command would write to a set of files in the +/shared/foo/+ directory.
|
||||||
|
|
||||||
|
When using direct mode, you can specify additional arguments which
|
||||||
|
should be passed to the underlying tool. If the argument
|
||||||
|
+\--+ is given on the command-line, then subsequent arguments are sent
|
||||||
|
directly to the underlying tool. For example, the following adjusts
|
||||||
|
the character set used by +mysqldump+:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --connect jdbc:mysql://server.foo.com/db --table bar \
|
||||||
|
--direct -- --default-character-set=latin1
|
||||||
|
----
|
||||||
|
|
||||||
|
File Formats
|
||||||
|
^^^^^^^^^^^^
|
||||||
|
|
||||||
|
You can import data in one of two file formats: delimited text or
|
||||||
|
SequenceFiles.
|
||||||
|
|
||||||
|
Delimited text is the default import format. You can also specify it
|
||||||
|
explicitly by using the +\--as-textfile+ argument. This argument will write
|
||||||
|
string-based representations of each record to the output files, with
|
||||||
|
delimiter characters between individual columns and rows. These
|
||||||
|
delimiters may be commas, tabs, or other characters. (The delimiters
|
||||||
|
can be selected; see "Output line formatting arguments.") The
|
||||||
|
following is the results of an example text-based import:
|
||||||
|
|
||||||
|
----
|
||||||
|
1,here is a message,2010-05-01
|
||||||
|
2,happy new year!,2010-01-01
|
||||||
|
3,another message,2009-11-12
|
||||||
|
----
|
||||||
|
|
||||||
|
Delimited text is appropriate for most non-binary data types. It also
|
||||||
|
readily supports further manipulation by other tools, such as Hive.
|
||||||
|
|
||||||
|
SequenceFiles are a binary format that store individual records in
|
||||||
|
custom record-specific data types. These data types are manifested as
|
||||||
|
Java classes. Sqoop will automatically generate these data types for
|
||||||
|
you. This format supports exact storage of all data in binary
|
||||||
|
representations, and is appropriate for storing binary data
|
||||||
|
(for example, +VARBINARY+ columns), or data that will be principally
|
||||||
|
manipulated by custom MapReduce programs (reading from SequenceFiles
|
||||||
|
is higher-performance than reading from text files, as records do not
|
||||||
|
need to be parsed).
|
||||||
|
|
||||||
|
By default, data is not compressed. You can compress
|
||||||
|
your data by using the deflate (gzip) algorithm with the +-z+ or
|
||||||
|
+\--compress+ argument. This applies to both SequenceFiles and text
|
||||||
|
files.
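
For example, the following sketch (the connect string and table are
illustrative) stores the imported data as compressed SequenceFiles:

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    --as-sequencefile -z
----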
|
||||||
|
|
||||||
|
Large Objects
|
||||||
|
^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Sqoop handles large objects (+BLOB+ and +CLOB+ columns) in particular
|
||||||
|
ways. If this data is truly large, then these columns should not be
|
||||||
|
fully materialized in memory for manipulation, as most columns are.
|
||||||
|
Instead, their data is handled in a streaming fashion. Large objects
|
||||||
|
can be stored inline with the rest of the data, in which case they are
|
||||||
|
fully materialized in memory on every access, or they can be stored in
|
||||||
|
a secondary storage file linked to the primary data storage. By
|
||||||
|
default, large objects less than 16 MB in size are stored inline with
|
||||||
|
the rest of the data. At a larger size, they are stored in files in
|
||||||
|
the +_lobs+ subdirectory of the import target directory. These files
|
||||||
|
are stored in a separate format optimized for large record storage,
|
||||||
|
which can accommodate records of up to 2^63 bytes each. The size at
|
||||||
|
which lobs spill into separate files is controlled by the
|
||||||
|
+\--inline-lob-limit+ argument, which takes a parameter specifying the
|
||||||
|
largest lob size to keep inline, in bytes. If you set the inline LOB
|
||||||
|
limit to 0, all large objects will be placed in external
|
||||||
|
storage.
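
As a sketch (the table name is illustrative), the following forces every
large object into external storage under the +_lobs+ subdirectory:

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table RESUMES \
    --inline-lob-limit 0
----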
|
||||||
|
|
||||||
|
include::output-args.txt[]
|
||||||
|
|
||||||
|
When importing to delimited files, the choice of delimiter is
|
||||||
|
important. Delimiters which appear inside string-based fields may
|
||||||
|
cause ambiguous parsing of the imported data by subsequent analysis
|
||||||
|
passes. For example, the string +"Hello, pleased to meet you"+ should
|
||||||
|
not be imported with the end-of-field delimiter set to a comma.
|
||||||
|
|
||||||
|
Delimiters may be specified as:
|
||||||
|
|
||||||
|
- a character (+\--fields-terminated-by X+)
|
||||||
|
- an escape character (+\--fields-terminated-by \t+). Supported escape
|
||||||
|
characters are:
|
||||||
|
* +\b+ (backspace)
|
||||||
|
* +\n+ (newline)
|
||||||
|
* +\r+ (carriage return)
|
||||||
|
* +\t+ (tab)
|
||||||
|
* +\"+ (double-quote)
|
||||||
|
* +\\'+ (single-quote)
|
||||||
|
* +\\+ (backslash)
|
||||||
|
* +\0+ (NUL) - This will insert NUL characters between fields or lines,
|
||||||
|
or will disable enclosing/escaping if used for one of the +\--enclosed-by+,
|
||||||
|
+\--optionally-enclosed-by+, or +\--escaped-by+ arguments.
|
||||||
|
- The octal representation of a UTF-8 character's code point. This
|
||||||
|
should be of the form +\0ooo+, where _ooo_ is the octal value.
|
||||||
|
For example, +\--fields-terminated-by \001+ would yield the +^A+ character.
|
||||||
|
- The hexadecimal representation of a UTF-8 character's code point. This
|
||||||
|
should be of the form +\0xhhh+, where _hhh_ is the hex value.
|
||||||
|
For example, +\--fields-terminated-by \0x0d+ would yield the carriage
|
||||||
|
return character.
|
||||||
|
|
||||||
|
The default delimiters are a comma (+,+) for fields, a newline (+\n+) for records, no quote
|
||||||
|
character, and no escape character. Note that this can lead to
|
||||||
|
ambiguous/unparseable records if you import database records containing
|
||||||
|
commas or newlines in the field data. For unambiguous parsing, an enclosing
|
||||||
|
character and an escape character must both be enabled, for example via +\--mysql-delimiters+.
|
||||||
|
|
||||||
|
If unambiguous delimiters cannot be presented, then use _enclosing_ and
|
||||||
|
_escaping_ characters. The combination of (optional)
|
||||||
|
enclosing and escaping characters will allow unambiguous parsing of
|
||||||
|
lines. For example, suppose one column of a dataset contained the
|
||||||
|
following values:
|
||||||
|
|
||||||
|
----
|
||||||
|
Some string, with a comma.
|
||||||
|
Another "string with quotes"
|
||||||
|
----
|
||||||
|
|
||||||
|
The following arguments would provide delimiters which can be
|
||||||
|
unambiguously parsed:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --fields-terminated-by , --escaped-by \\ --enclosed-by '\"' ...
|
||||||
|
----
|
||||||
|
|
||||||
|
(Note that to prevent the shell from mangling the enclosing character,
|
||||||
|
we have enclosed that argument itself in single-quotes.)
|
||||||
|
|
||||||
|
The result of the above arguments applied to the above dataset would
|
||||||
|
be:
|
||||||
|
|
||||||
|
----
|
||||||
|
"Some string, with a comma.","1","2","3"...
|
||||||
|
"Another \"string with quotes\"","4","5","6"...
|
||||||
|
----
|
||||||
|
|
||||||
|
Here the imported strings are shown in the context of additional
|
||||||
|
columns (+"1","2","3"+, etc.) to demonstrate the full effect of enclosing
|
||||||
|
and escaping. The enclosing character is only strictly necessary when
|
||||||
|
delimiter characters appear in the imported text. The enclosing
|
||||||
|
character can therefore be specified as optional:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --optionally-enclosed-by '\"' (the rest as above)...
|
||||||
|
----
|
||||||
|
|
||||||
|
Which would result in the following import:
|
||||||
|
|
||||||
|
----
|
||||||
|
"Some string, with a comma.",1,2,3...
|
||||||
|
"Another \"string with quotes\"",4,5,6...
|
||||||
|
----
|
||||||
|
|
||||||
|
NOTE: Hive does not support enclosing and escaping characters. You
|
||||||
|
must choose unambiguous field and record-terminating delimiters
|
||||||
|
without the help of escaping and enclosing characters when
|
||||||
|
working with Hive; this is a limitation of Hive's input parsing
|
||||||
|
abilities.
|
||||||
|
|
||||||
|
The +\--mysql-delimiters+ argument is a shorthand argument which uses
|
||||||
|
the default delimiters for the +mysqldump+ program.
|
||||||
|
If you use the +mysqldump+ delimiters in conjunction with a
|
||||||
|
direct-mode import (with +\--direct+), very fast imports can be
|
||||||
|
achieved.
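
For example (the connect string and table are illustrative):

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    --direct --mysql-delimiters
----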
|
||||||
|
|
||||||
|
While the choice of delimiters is most important for a text-mode
|
||||||
|
import, it is still relevant if you import to SequenceFiles with
|
||||||
|
+\--as-sequencefile+. The generated class' +toString()+ method
|
||||||
|
will use the delimiters you specify, so subsequent formatting of
|
||||||
|
the output data will rely on the delimiters you choose.
|
||||||
|
|
||||||
|
include::input-args.txt[]
|
||||||
|
|
||||||
|
When Sqoop imports data to HDFS, it generates a Java class which can
|
||||||
|
reinterpret the text files that it creates when doing a
|
||||||
|
delimited-format import. The delimiters are chosen with arguments such
|
||||||
|
as +\--fields-terminated-by+; this controls both how the data is
|
||||||
|
written to disk, and how the generated +parse()+ method reinterprets
|
||||||
|
this data. The delimiters used by the +parse()+ method can be chosen
|
||||||
|
independently of the output arguments, by using
|
||||||
|
+\--input-fields-terminated-by+, and so on. This is useful, for example, to
|
||||||
|
generate classes which can parse records created with one set of
|
||||||
|
delimiters, and emit the records to a different set of files using a
|
||||||
|
separate set of delimiters.
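
As a sketch (the delimiter choices are illustrative), the following
produces a class whose +parse()+ method reads tab-separated records,
while the files written by the import itself are comma-separated:

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    --fields-terminated-by ',' --input-fields-terminated-by '\t'
----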
|
||||||
|
|
||||||
|
include::hive-args.txt[]
|
||||||
|
|
||||||
|
include::hive.txt[]
|
||||||
|
|
||||||
|
include::codegen-args.txt[]
|
||||||
|
|
||||||
|
As mentioned earlier, a byproduct of importing a table to HDFS is a
|
||||||
|
class which can manipulate the imported data. If the data is stored in
|
||||||
|
SequenceFiles, this class will be used for the data's serialization
|
||||||
|
container. Therefore, you should use this class in your subsequent
|
||||||
|
MapReduce processing of the data.
|
||||||
|
|
||||||
|
The class is typically named after the table; a table named +foo+ will
|
||||||
|
generate a class named +foo+. You may want to override this class
|
||||||
|
name. For example, if your table is named +EMPLOYEES+, you may want to
|
||||||
|
specify +\--class-name Employee+ instead. Similarly, you can specify
|
||||||
|
just the package name with +\--package-name+. The following import
|
||||||
|
generates a class named +com.foocorp.SomeTable+:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --connect <connect-str> --table SomeTable --package-name com.foocorp
|
||||||
|
----
|
||||||
|
|
||||||
|
The +.java+ source file for your class will be written to the current
|
||||||
|
working directory when you run +sqoop+. You can control the output
|
||||||
|
directory with +\--outdir+. For example, +\--outdir src/generated/+.
|
||||||
|
|
||||||
|
The import process compiles the source into +.class+ and +.jar+ files;
|
||||||
|
these are ordinarily stored under +/tmp+. You can select an alternate
|
||||||
|
target directory with +\--bindir+. For example, +\--bindir /scratch+.
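
For instance (the directories here are illustrative):

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    --outdir src/generated/ --bindir /scratch
----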
|
||||||
|
|
||||||
|
If you already have a compiled class that can be used to perform the
|
||||||
|
import and want to suppress the code-generation aspect of the import
|
||||||
|
process, you can use an existing jar and class by
|
||||||
|
providing the +\--jar-file+ and +\--class-name+ options. For example:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --table SomeTable --jar-file mydatatypes.jar \
|
||||||
|
--class-name SomeTableType
|
||||||
|
----
|
||||||
|
|
||||||
|
This command will load the +SomeTableType+ class out of +mydatatypes.jar+.
|
||||||
|
|
||||||
|
|
||||||
|
Example Invocations
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The following examples illustrate how to use the import tool in a variety
|
||||||
|
of situations.
|
||||||
|
|
||||||
|
A basic import of a table named +EMPLOYEES+ in the +corp+ database:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES
|
||||||
|
----
|
||||||
|
|
||||||
|
A basic import requiring a login:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
|
||||||
|
--username SomeUser -P
|
||||||
|
Enter password: (hidden)
|
||||||
|
----
|
||||||
|
|
||||||
|
Selecting specific columns from the +EMPLOYEES+ table:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
|
||||||
|
--columns "employee_id,first_name,last_name,job_title"
|
||||||
|
----
|
||||||
|
|
||||||
|
Controlling the import parallelism (using 8 parallel tasks):
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
|
||||||
|
-m 8
|
||||||
|
----
|
||||||
|
|
||||||
|
Enabling the MySQL "direct mode" fast path:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
|
||||||
|
--direct
|
||||||
|
----
|
||||||
|
|
||||||
|
Storing data in SequenceFiles, and setting the generated class name to
|
||||||
|
+com.foocorp.Employee+:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
|
||||||
|
--class-name com.foocorp.Employee --as-sequencefile
|
||||||
|
----
|
||||||
|
|
||||||
|
Specifying the delimiters to use in a text-mode import:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
|
||||||
|
--fields-terminated-by '\t' --lines-terminated-by '\n' \
|
||||||
|
--optionally-enclosed-by '\"'
|
||||||
|
----
|
||||||
|
|
||||||
|
Importing the data to Hive:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
|
||||||
|
--hive-import
|
||||||
|
----
|
||||||
|
|
||||||
|
Importing only new employees:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
|
||||||
|
--where "start_date > '2010-01-01'"
|
||||||
|
----
|
||||||
|
|
||||||
|
Changing the splitting column from the default:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
|
||||||
|
--split-by dept_id
|
||||||
|
----
|
||||||
|
|
||||||
|
Verifying that an import was successful:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ hadoop fs -ls EMPLOYEES
|
||||||
|
Found 5 items
|
||||||
|
drwxr-xr-x - someuser somegrp 0 2010-04-27 16:40 /user/someuser/EMPLOYEES/_logs
|
||||||
|
-rw-r--r-- 1 someuser somegrp 2913511 2010-04-27 16:40 /user/someuser/EMPLOYEES/part-m-00000
|
||||||
|
-rw-r--r-- 1 someuser somegrp 1683938 2010-04-27 16:40 /user/someuser/EMPLOYEES/part-m-00001
|
||||||
|
-rw-r--r-- 1 someuser somegrp 7245839 2010-04-27 16:40 /user/someuser/EMPLOYEES/part-m-00002
|
||||||
|
-rw-r--r-- 1 someuser somegrp 7842523 2010-04-27 16:40 /user/someuser/EMPLOYEES/part-m-00003
|
||||||
|
|
||||||
|
$ hadoop fs -cat EMPLOYEES/part-m-00000 | head -n 10
|
||||||
|
0,joe,smith,engineering
|
||||||
|
1,jane,doe,marketing
|
||||||
|
...
|
||||||
|
----
|
||||||
|
|
||||||
|
|
34
src/docs/user/input-args.txt
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
.Input parsing arguments:
|
||||||
|
[grid="all"]
|
||||||
|
`----------------------------------------`----------------------------------
|
||||||
|
Argument Description
|
||||||
|
----------------------------------------------------------------------------
|
||||||
|
+\--input-enclosed-by <char>+ Sets a required field encloser
|
||||||
|
+\--input-escaped-by <char>+ Sets the input escape \
|
||||||
|
character
|
||||||
|
+\--input-fields-terminated-by <char>+ Sets the input field separator
|
||||||
|
+\--input-lines-terminated-by <char>+ Sets the input end-of-line \
|
||||||
|
character
|
||||||
|
+\--input-optionally-enclosed-by <char>+ Sets a field enclosing \
|
||||||
|
character
|
||||||
|
----------------------------------------------------------------------------
|
||||||
|
|
45
src/docs/user/intro.txt
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
Introduction
|
||||||
|
------------
|
||||||
|
|
||||||
|
Sqoop is a tool designed to transfer data between Hadoop and
|
||||||
|
relational databases. You can use Sqoop to import data from a
|
||||||
|
relational database management system (RDBMS) such as MySQL or Oracle
|
||||||
|
into the Hadoop Distributed File System (HDFS),
|
||||||
|
transform the data in Hadoop MapReduce, and then export the data back
|
||||||
|
into an RDBMS.
|
||||||
|
|
||||||
|
Sqoop automates most of this process, relying on the database to
|
||||||
|
describe the schema for the data to be imported. Sqoop uses MapReduce
|
||||||
|
to import and export the data, which provides parallel operation as
|
||||||
|
well as fault tolerance.
|
||||||
|
|
||||||
|
This document describes how to get started using Sqoop to move data
|
||||||
|
between databases and Hadoop and provides reference information for
|
||||||
|
the operation of the Sqoop command-line tool suite. This document is
|
||||||
|
intended for:
|
||||||
|
|
||||||
|
- System and application programmers
|
||||||
|
- System administrators
|
||||||
|
- Database administrators
|
||||||
|
- Data analysts
|
||||||
|
- Data engineers
|
||||||
|
|
@ -17,27 +17,39 @@
|
|||||||
////
|
////
|
||||||
|
|
||||||
|
|
||||||
Generated Class Names
|
+sqoop-list-databases+
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
----------------------
|
||||||
|
|
||||||
By default, classes are named after the table they represent. e.g.,
|
Purpose
|
||||||
+sqoop --table foo+ will generate a file named +foo.java+. You can
|
~~~~~~~
|
||||||
override the generated class name with the +--class-name+ argument.
|
|
||||||
|
List database schemas present on a server.
|
||||||
|
|
||||||
|
Syntax
|
||||||
|
~~~~~~
|
||||||
|
|
||||||
----
|
----
|
||||||
$ sqoop --connect jdbc:mysql://database.example.com/employees \
|
$ sqoop list-databases (generic-args) (list-databases-args)
|
||||||
--table employee_names --class-name com.example.EmployeeNames
|
$ sqoop-list-databases (generic-args) (list-databases-args)
|
||||||
----
|
----
|
||||||
_This generates a file named +com/example/EmployeeNames.java+_
|
|
||||||
|
|
||||||
If you want to specify a package name for generated classes, but
|
Although the Hadoop generic arguments must precede any list-databases
|
||||||
still want them to be named after the table they represent, you
|
arguments, the list-databases arguments can be entered in any order
|
||||||
can instead use the argument +--package-name+:
|
with respect to one another.
|
||||||
|
|
||||||
|
include::common-args.txt[]
|
||||||
|
|
||||||
|
Example Invocations
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
List database schemas available on a MySQL server:
|
||||||
|
|
||||||
----
|
----
|
||||||
$ sqoop --connect jdbc:mysql://database.example.com/employees \
|
$ sqoop list-databases --connect jdbc:mysql://database.example.com/
|
||||||
--table employee_names --package-name com.example
|
information_schema
|
||||||
|
employees
|
||||||
----
|
----
|
||||||
_This generates a file named +com/example/employee_names.java+_
|
|
||||||
|
|
||||||
|
NOTE: This only works with HSQLDB and MySQL. A vendor-agnostic implementation
|
||||||
|
of this function is not yet available.
|
||||||
|
|
54
src/docs/user/list-tables.txt
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
+sqoop-list-tables+
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
~~~~~~~
|
||||||
|
|
||||||
|
List tables present in a database.
|
||||||
|
|
||||||
|
Syntax
|
||||||
|
~~~~~~
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop list-tables (generic-args) (list-tables-args)
|
||||||
|
$ sqoop-list-tables (generic-args) (list-tables-args)
|
||||||
|
----
|
||||||
|
|
||||||
|
Although the Hadoop generic arguments must precede any list-tables
|
||||||
|
arguments, the list-tables arguments can be entered in any order
|
||||||
|
with respect to one another.
|
||||||
|
|
||||||
|
include::common-args.txt[]
|
||||||
|
|
||||||
|
Example Invocations
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
List tables available in the "corp" database:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop list-tables --connect jdbc:mysql://database.example.com/corp
|
||||||
|
employees
|
||||||
|
payroll_checks
|
||||||
|
job_descriptions
|
||||||
|
office_supplies
|
||||||
|
----
|
||||||
|
|
35
src/docs/user/output-args.txt
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
.Output line formatting arguments:
|
||||||
|
[grid="all"]
|
||||||
|
`----------------------------------`----------------------------------
|
||||||
|
Argument Description
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
+\--enclosed-by <char>+ Sets a required field enclosing \
|
||||||
|
character
|
||||||
|
+\--escaped-by <char>+ Sets the escape character
|
||||||
|
+\--fields-terminated-by <char>+ Sets the field separator character
|
||||||
|
+\--lines-terminated-by <char>+ Sets the end-of-line character
|
||||||
|
+\--mysql-delimiters+ Uses MySQL's default delimiter set:\
|
||||||
|
fields: +,+ lines: +\n+ \
|
||||||
|
escaped-by: +\+ \
|
||||||
|
optionally-enclosed-by: +'+
|
||||||
|
+\--optionally-enclosed-by <char>+ Sets a field enclosing character
|
||||||
|
----------------------------------------------------------------------
|
||||||
|
|
@ -19,26 +19,26 @@
|
|||||||
The delimiters used to separate fields and records can be specified
|
The delimiters used to separate fields and records can be specified
|
||||||
on the command line, as can a quoting character and an escape character
|
on the command line, as can a quoting character and an escape character
|
||||||
(for quoting delimiters inside a value). Data imported with
|
(for quoting delimiters inside a value). Data imported with
|
||||||
+--as-textfile+ will be formatted according to these parameters. Classes
|
+\--as-textfile+ will be formatted according to these parameters. Classes
|
||||||
generated by Sqoop will encode this information, so using +toString()+
|
generated by Sqoop will encode this information, so using +toString()+
|
||||||
from a data record stored +--as-sequencefile+ will reproduce your
|
from a data record stored +\--as-sequencefile+ will reproduce your
|
||||||
specified formatting.
|
specified formatting.
|
||||||
|
|
||||||
The +(char)+ argument for each argument in this section can be specified
|
The +(char)+ argument for each argument in this section can be specified
|
||||||
either as a normal character (e.g., +--fields-terminated-by ,+) or via
|
either as a normal character (e.g., +\--fields-terminated-by ,+) or via
|
||||||
an escape sequence. Arguments of the form +\0xhhh+ will be interpreted
|
an escape sequence. Arguments of the form +\0xhhh+ will be interpreted
|
||||||
as a hexadecimal representation of a character with hex number _hhh_.
|
as a hexadecimal representation of a character with hex number _hhh_.
|
||||||
Arguments of the form +\0ooo+ will be treated as an octal representation
|
Arguments of the form +\0ooo+ will be treated as an octal representation
|
||||||
of a character represented by octal number _ooo_. The special escapes
|
of a character represented by octal number _ooo_. The special escapes
|
||||||
+\n+, +\r+, +\"+, +\b+, +\t+, and +\\+ act as they do inside Java strings. +\0+ will be
|
+\n+, +\r+, +\"+, +\b+, +\t+, and +\\+ act as they do inside Java strings. +\0+ will be
|
||||||
treated as NUL. This will insert NUL characters between fields or lines
|
treated as NUL. This will insert NUL characters between fields or lines
|
||||||
(if used for +--fields-terminated-by+ or +--lines-terminated-by+), or will
|
(if used for +\--fields-terminated-by+ or +\--lines-terminated-by+), or will
|
||||||
disable enclosing/escaping if used for one of the +--enclosed-by+,
|
disable enclosing/escaping if used for one of the +\--enclosed-by+,
|
||||||
+--optionally-enclosed-by+, or +--escaped-by+ arguments.
|
+\--optionally-enclosed-by+, or +\--escaped-by+ arguments.
|
||||||
|
|
||||||
The default delimiters are +,+ for fields, +\n+ for records, no quote
|
The default delimiters are +,+ for fields, +\n+ for records, no quote
|
||||||
character, and no escape character. Note that this can lead to
|
character, and no escape character. Note that this can lead to
|
||||||
ambiguous/unparseable records if you import database records containing
|
ambiguous/unparseable records if you import database records containing
|
||||||
commas or newlines in the field data. For unambiguous parsing, both must
|
commas or newlines in the field data. For unambiguous parsing, both must
|
||||||
be enabled, e.g., via +--mysql-delimiters+.
|
be enabled, e.g., via +\--mysql-delimiters+.
|
||||||
|
|
61
src/docs/user/preface.txt
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Supported Releases
|
||||||
|
------------------
|
||||||
|
|
||||||
|
This documentation applies to Sqoop v1.0.0 (June 2010).
|
||||||
|
|
||||||
|
Sqoop Releases
|
||||||
|
--------------
|
||||||
|
|
||||||
|
Sqoop is an open source software product of Cloudera, Inc.
|
||||||
|
|
||||||
|
Software development for Sqoop occurs at http://github.com/cloudera/sqoop.
|
||||||
|
At that site you can obtain:
|
||||||
|
- New releases of Sqoop as well as its most recent source code
|
||||||
|
- An issue tracker
|
||||||
|
- A wiki that contains Sqoop documentation
|
||||||
|
|
||||||
|
Sqoop is compatible with Apache Hadoop 0.21 and Cloudera's
|
||||||
|
Distribution for Hadoop version 3 (CDH3).
|
||||||
|
|
||||||
|
Prerequisites
|
||||||
|
-------------
|
||||||
|
|
||||||
|
The following prerequisite knowledge is required for this product:
|
||||||
|
|
||||||
|
- Basic computer technology and terminology
|
||||||
|
- Familiarity with command-line interfaces such as +bash+
|
||||||
|
- Relational database management systems
|
||||||
|
- Basic familiarity with the purpose and operation of Hadoop
|
||||||
|
|
||||||
|
Before you can use Sqoop, a release of Hadoop must be installed and
|
||||||
|
configured. We recommend that you download Cloudera's Distribution
|
||||||
|
for Hadoop (CDH3) from the Cloudera Software Archive at
|
||||||
|
http://archive.cloudera.com for straightforward installation of Hadoop
|
||||||
|
on Linux systems.
|
||||||
|
|
||||||
|
This document assumes you are using a Linux or Linux-like environment.
|
||||||
|
If you are using Windows, you may be able to use Cygwin to accomplish
|
||||||
|
most of the following tasks. If you are using Mac OS X, you should see
|
||||||
|
few (if any) compatibility errors. Sqoop is predominantly operated and
|
||||||
|
tested on Linux.
|
||||||
|
|
33
src/docs/user/support.txt
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
Getting Support
|
||||||
|
---------------
|
||||||
|
|
||||||
|
Report bugs in Sqoop to the issue tracker at
|
||||||
|
http://github.com/cloudera/sqoop/issues[].
|
||||||
|
|
||||||
|
For general questions and answers, a support forum is available at
|
||||||
|
http://getsatisfaction.com/cloudera/products/cloudera_sqoop[].
|
||||||
|
|
||||||
|
Before contacting either forum, run your Sqoop job with the
|
||||||
|
+\--verbose+ flag to acquire as much debugging information as
|
||||||
|
possible. Also report the string returned by +sqoop version+ as
|
||||||
|
well as the version of Hadoop you are running (+hadoop version+).
|
||||||
|
|
||||||
|
|
168
src/docs/user/tools.txt
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
|
||||||
|
////
|
||||||
|
Licensed to Cloudera, Inc. under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
////
|
||||||
|
|
||||||
|
Sqoop Tools
|
||||||
|
-----------
|
||||||
|
|
||||||
|
Sqoop is a collection of related tools. To use Sqoop, you specify the
|
||||||
|
tool you want to use and the arguments that control the tool.
|
||||||
|
|
||||||
|
If Sqoop is compiled from its own source, you can run Sqoop without a formal
|
||||||
|
installation process by running the +bin/sqoop+ program. Users
|
||||||
|
of a packaged deployment of Sqoop (such as an RPM shipped with Cloudera's
|
||||||
|
Distribution for Hadoop) will see this program installed as +/usr/bin/sqoop+.
|
||||||
|
The remainder of this documentation will refer to this program as
|
||||||
|
+sqoop+. For example:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop tool-name [tool-arguments]
|
||||||
|
----
|
||||||
|
|
||||||
|
NOTE: The following examples that begin with a +$+ character indicate
|
||||||
|
that the commands must be entered at a terminal prompt (such as
|
||||||
|
+bash+). The +$+ character represents the prompt itself; you should
|
||||||
|
not start these commands by typing a +$+. You can also enter commands
|
||||||
|
inline in the text of a paragraph; for example, +sqoop help+. These
|
||||||
|
examples do not show a +$+ prefix, but you should enter them the same
|
||||||
|
way. Don't confuse the +$+ shell prompt in the examples with the +$+
|
||||||
|
that precedes an environment variable name. For example, the string
|
||||||
|
literal +$HADOOP_HOME+ includes a "+$+".
|
||||||
|
|
||||||
|
Sqoop ships with a help tool. To display a list of all available
|
||||||
|
tools, type the following command:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop help
|
||||||
|
usage: sqoop COMMAND [ARGS]
|
||||||
|
|
||||||
|
Available commands:
|
||||||
|
codegen Generate code to interact with database records
|
||||||
|
create-hive-table Import a table definition into Hive
|
||||||
|
eval Evaluate a SQL statement and display the results
|
||||||
|
export Export an HDFS directory to a database table
|
||||||
|
help List available commands
|
||||||
|
import Import a table from a database to HDFS
|
||||||
|
import-all-tables Import tables from a database to HDFS
|
||||||
|
list-databases List available databases on a server
|
||||||
|
list-tables List available tables in a database
|
||||||
|
version Display version information
|
||||||
|
|
||||||
|
See 'sqoop help COMMAND' for information on a specific command.
|
||||||
|
----
|
||||||
|
|
||||||
|
You can display help for a specific tool by entering: +sqoop help
|
||||||
|
(tool-name)+; for example, +sqoop help import+.
|
||||||
|
|
||||||
|
You can also add the +\--help+ argument to any command: +sqoop import
|
||||||
|
\--help+.
|
||||||
|
|
||||||
|
Using Command Aliases
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
In addition to typing the +sqoop (toolname)+ syntax, you can use alias
|
||||||
|
scripts that specify the +sqoop-(toolname)+ syntax. For example, the
|
||||||
|
scripts +sqoop-import+, +sqoop-export+, etc. each select a specific
|
||||||
|
tool.
|
||||||
|
|
||||||
|
Controlling the Hadoop Installation
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
You invoke Sqoop through the program launch capability provided by
|
||||||
|
Hadoop. The +sqoop+ command-line program is a wrapper which runs the
|
||||||
|
+bin/hadoop+ script shipped with Hadoop. If you have multiple
|
||||||
|
installations of Hadoop present on your machine, you can select the
|
||||||
|
Hadoop installation by setting the +$HADOOP_HOME+ environment
|
||||||
|
variable.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ HADOOP_HOME=/path/to/some/hadoop sqoop import --arguments...
|
||||||
|
----
|
||||||
|
|
||||||
|
or:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ export HADOOP_HOME=/some/path/to/hadoop
|
||||||
|
$ sqoop import --arguments...
|
||||||
|
----
|
||||||
|
|
||||||
|
If +$HADOOP_HOME+ is not set, Sqoop will use the default installation
|
||||||
|
location for Cloudera's Distribution for Hadoop, +/usr/lib/hadoop+.
|
||||||
|
|
||||||
|
The active Hadoop configuration is loaded from +$HADOOP_HOME/conf/+,
|
||||||
|
unless the +$HADOOP_CONF_DIR+ environment variable is set.
|
||||||
|
|
||||||
|
|
||||||
|
Using Generic and Specific Arguments
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
To control the operation of each Sqoop tool, you use generic and
|
||||||
|
specific arguments.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop help import
|
||||||
|
usage: sqoop import [GENERIC-ARGS] [TOOL-ARGS]
|
||||||
|
|
||||||
|
Common arguments:
|
||||||
|
--connect <jdbc-uri> Specify JDBC connect string
|
||||||
|
--driver <class-name> Manually specify JDBC driver class to use
|
||||||
|
--hadoop-home <dir> Override $HADOOP_HOME
|
||||||
|
--help Print usage instructions
|
||||||
|
-P Read password from console
|
||||||
|
--password <password> Set authentication password
|
||||||
|
--username <username> Set authentication username
|
||||||
|
--verbose Print more information while working
|
||||||
|
|
||||||
|
[...]
|
||||||
|
|
||||||
|
Generic Hadoop command-line arguments:
|
||||||
|
(must preceed any tool-specific arguments)
|
||||||
|
Generic options supported are
|
||||||
|
-conf <configuration file> specify an application configuration file
|
||||||
|
-D <property=value> use value for given property
|
||||||
|
-fs <local|namenode:port> specify a namenode
|
||||||
|
-jt <local|jobtracker:port> specify a job tracker
|
||||||
|
-files <comma separated list of files> specify comma separated files to be copied to the map reduce cluster
|
||||||
|
-libjars <comma separated list of jars> specify comma separated jar files to include in the classpath.
|
||||||
|
-archives <comma separated list of archives> specify comma separated archives to be unarchived on the compute machines.
|
||||||
|
|
||||||
|
The general command line syntax is
|
||||||
|
bin/hadoop command [genericOptions] [commandOptions]
|
||||||
|
----
|
||||||
|
|
||||||
|
You must supply the generic arguments +-conf+, +-D+, and so on after the
|
||||||
|
tool name but *before* any tool-specific arguments (such as
|
||||||
|
+\--connect+). Note that generic Hadoop arguments are preceded by a
|
||||||
|
single dash character (+-+), whereas tool-specific arguments start
|
||||||
|
with two dashes (+\--+), unless they are single character arguments such as +-P+.
|
||||||
|
|
||||||
|
The +-conf+, +-D+, +-fs+ and +-jt+ arguments control the configuration
|
||||||
|
and Hadoop server settings. The +-files+, +-libjars+, and +-archives+
|
||||||
|
arguments are not typically used with Sqoop, but they are included as
|
||||||
|
part of Hadoop's internal argument-parsing system.
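
For example, a sketch such as the following (the property value and
connect string are illustrative) sets a Hadoop property with +-D+ before
any tool-specific arguments:

----
$ sqoop import -D mapred.job.name=employee-import \
    --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES
----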
|
||||||
|
|
||||||
|
|
||||||
|
Using Tools
|
||||||
|
~~~~~~~~~~~
|
||||||
|
|
||||||
|
The following sections will describe each tool's operation. The
|
||||||
|
tools are listed in the most likely order you will find them useful.
|
||||||
|
|
@ -17,18 +17,32 @@
|
|||||||
////
|
////
|
||||||
|
|
||||||
|
|
||||||
Listing Available Tables
|
+sqoop-version+
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
---------------
|
||||||
|
|
||||||
Within a database, you can list the tables available for import with
|
Purpose
|
||||||
the +--list-tables+ command. The following example shows four tables available
|
~~~~~~~
|
||||||
within the "employees" example database:
|
|
||||||
|
Display version information for Sqoop.
|
||||||
|
|
||||||
|
Syntax
|
||||||
|
~~~~~~
|
||||||
|
|
||||||
----
|
----
|
||||||
$ sqoop --connect jdbc:mysql://database.example.com/employees --list-tables
|
$ sqoop version
|
||||||
employee_names
|
$ sqoop-version
|
||||||
payroll_checks
|
----
|
||||||
job_descriptions
|
|
||||||
office_supplies
|
|
||||||
|
Example Invocations
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Display the version:
|
||||||
|
|
||||||
|
----
|
||||||
|
$ sqoop version
|
||||||
|
Sqoop 1.0.0
|
||||||
|
git commit id 46b3e06b79a8411320d77c984c3030db47dd1c22
|
||||||
|
Compiled by aaron@jargon on Mon May 17 13:43:22 PDT 2010
|
||||||
----
|
----
|
||||||
|
|
@ -1092,7 +1092,11 @@ public abstract static class Reader implements Closeable {
|
|||||||
public abstract Path getPath();
|
public abstract Path getPath();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Report the current position in the file.
|
* Report the current position in the file. Note that the internal
|
||||||
|
* cursor may move in an unpredictable fashion; e.g., to fetch
|
||||||
|
* additional data from the index stored at the end of the file.
|
||||||
|
* Clients may be more interested in the getRecordOffset() method
|
||||||
|
* which returns the starting offset of the current record.
|
||||||
* @return the current offset from the start of the file in bytes.
|
* @return the current offset from the start of the file in bytes.
|
||||||
*/
|
*/
|
||||||
public abstract long tell() throws IOException;
|
public abstract long tell() throws IOException;
|
||||||
|
@ -102,7 +102,7 @@ public abstract class BaseSqoopTool extends SqoopTool {
|
|||||||
public static final String PACKAGE_NAME_ARG = "package-name";
|
public static final String PACKAGE_NAME_ARG = "package-name";
|
||||||
public static final String CLASS_NAME_ARG = "class-name";
|
public static final String CLASS_NAME_ARG = "class-name";
|
||||||
public static final String JAR_FILE_NAME_ARG = "jar-file";
|
public static final String JAR_FILE_NAME_ARG = "jar-file";
|
||||||
public static final String DEBUG_SQL_ARG = "expr";
|
public static final String DEBUG_SQL_ARG = "query";
|
||||||
public static final String DEBUG_SQL_SHORT_ARG = "e";
|
public static final String DEBUG_SQL_SHORT_ARG = "e";
|
||||||
public static final String VERBOSE_ARG = "verbose";
|
public static final String VERBOSE_ARG = "verbose";
|
||||||
public static final String HELP_ARG = "help";
|
public static final String HELP_ARG = "help";
|
||||||
@ -399,7 +399,7 @@ protected RelatedOptions getCodeGenOpts(boolean multiTable) {
|
|||||||
if (!multiTable) {
|
if (!multiTable) {
|
||||||
codeGenOpts.addOption(OptionBuilder.withArgName("name")
|
codeGenOpts.addOption(OptionBuilder.withArgName("name")
|
||||||
.hasArg()
|
.hasArg()
|
||||||
.withDescription("Sets the generated class name."
|
.withDescription("Sets the generated class name. "
|
||||||
+ "This overrides --" + PACKAGE_NAME_ARG + ". When combined "
|
+ "This overrides --" + PACKAGE_NAME_ARG + ". When combined "
|
||||||
+ "with --" + JAR_FILE_NAME_ARG + ", sets the input class.")
|
+ "with --" + JAR_FILE_NAME_ARG + ", sets the input class.")
|
||||||
.withLongOpt(CLASS_NAME_ARG)
|
.withLongOpt(CLASS_NAME_ARG)
|
||||||
|