Rewrite documentation for updated Sqoop API
Add documentation for all SqoopTool implementations. Add database compatibility notes. Separate the user guide from the development guide.

From: Aaron Kimball <aaron@cloudera.com>

git-svn-id: https://svn.apache.org/repos/asf/incubator/sqoop/trunk@1149902 13f79535-47bb-0310-9956-ffa450edef68

This commit is contained in commit faabc51a90 (parent 15f5375d76).
@@ -16,28 +16,40 @@
 BUILDROOT=../../build
 BUILD_DIR=$(BUILDROOT)/docs

-all: man userguide
+all: man userguide devguide

 man: $(BUILD_DIR)/sqoop.1.gz

 userguide: $(BUILD_DIR)/SqoopUserGuide.html

-$(BUILD_DIR)/sqoop.1.gz: Sqoop-manpage.txt *formatting*.txt
-        asciidoc -b docbook -d manpage Sqoop-manpage.txt
+devguide: $(BUILD_DIR)/SqoopDevGuide.html
+
+$(BUILD_DIR)/sqoop.1.gz: user/Sqoop-manpage.txt user/*formatting*.txt
+        asciidoc -b docbook -d manpage user/Sqoop-manpage.txt
         xmlto man Sqoop-manpage.xml
         gzip sqoop.1
         rm Sqoop-manpage.xml
         mkdir -p $(BUILD_DIR)
         mv sqoop.1.gz $(BUILD_DIR)

-$(BUILD_DIR)/SqoopUserGuide.html: SqoopUserGuide.txt *.txt
-        asciidoc SqoopUserGuide.txt
+$(BUILD_DIR)/SqoopUserGuide.html: user/*.txt
+        asciidoc -a toc -a toclevels=1 -a "toc-title=Table of Contents" \
+                user/SqoopUserGuide.txt
         mkdir -p $(BUILD_DIR)
-        mv SqoopUserGuide.html $(BUILD_DIR)
+        mv user/SqoopUserGuide.html $(BUILD_DIR)
+
+$(BUILD_DIR)/SqoopDevGuide.html: dev/*.txt
+        asciidoc -a toc -a toclevels=1 -a "toc-title=Table of Contents" \
+                dev/SqoopDevGuide.txt
+        mkdir -p $(BUILD_DIR)
+        mv dev/SqoopDevGuide.html $(BUILD_DIR)

 clean:
         -rm $(BUILD_DIR)/sqoop.1.gz
         -rm $(BUILD_DIR)/SqoopUserGuide.html
         -rm $(BUILD_DIR)/SqoopUserGuide.pdf
+        -rm user/SqoopUserGuide.html
+        -rm dev/SqoopDevGuide.html

-.PHONY: all man userguide clean
+.PHONY: all man userguide devguide clean
@@ -1,67 +0,0 @@
////
  Licensed to Cloudera, Inc. under one or more
  contributor license agreements. See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License. You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
////

include::intro.txt[]

The Sqoop Command Line
----------------------

To execute Sqoop, run with Hadoop:

----
$ bin/hadoop jar contrib/sqoop/hadoop-$(version)-sqoop.jar (arguments)
----

NOTE: Throughout this document, we will use `sqoop` as shorthand for the
above, i.e., `$ sqoop (arguments)`.

You pass this program options describing the import job you want to perform.
If you need a hint, running Sqoop with `--help` will print out a list of all
the command-line options available. The +sqoop(1)+ manual page will also
describe Sqoop's available arguments in greater detail. The manual page is
built in `$HADOOP_HOME/build/contrib/sqoop/doc/sqoop.1.gz`.
The following subsections describe the most common modes of operation.

include::connecting.txt[]
include::listing-dbs.txt[]
include::listing-tables.txt[]
include::full-db-import.txt[]
include::table-import.txt[]
include::controlling-output-format.txt[]
include::classnames.txt[]
include::misc-args.txt[]
include::direct.txt[]
include::hive.txt[]
include::export.txt[]
include::supported-dbs.txt[]
include::api-reference.txt[]
@@ -16,20 +16,13 @@
  limitations under the License.
////

include::intro.txt[]

Listing Available Databases
~~~~~~~~~~~~~~~~~~~~~~~~~~~

include::preface.txt[]

include::compiling.txt[]

include::api-reference.txt[]

Once connected to a database server, you can list the available
databases with the +--list-databases+ parameter. This currently is supported
only by HSQLDB and MySQL. Note that in this case, the connect string does
not include a database name, just a server address.

----
$ sqoop --connect jdbc:mysql://database.example.com/ --list-databases
information_schema
employees
----

_This only works with HSQLDB and MySQL. A vendor-agnostic implementation of
this function has not yet been implemented._
@@ -19,29 +19,38 @@
Developer API Reference
-----------------------

This section is intended to specify the APIs available to application writers
integrating with Sqoop, and those modifying Sqoop. The next three subsections
are written from the following three perspectives: those using classes generated
by Sqoop, and its public library; those writing Sqoop extensions (i.e.,
additional ConnManager implementations that interact with more databases); and
those modifying Sqoop's internals. Each section describes the system in
successively greater depth.

This section specifies the APIs available to application writers who
want to integrate with Sqoop, and those who want to modify Sqoop.

The next three subsections are written for the following use cases:

- Using classes generated by Sqoop and its public library
- Writing Sqoop extensions (that is, additional ConnManager implementations
  that interact with more databases)
- Modifying Sqoop's internals

Each section describes the system in successively greater depth.


The External API
~~~~~~~~~~~~~~~~

Sqoop auto-generates classes that represent the tables imported into HDFS. The
class contains member fields for each column of the imported table; an instance
of the class holds one row of the table. The generated classes implement the
serialization APIs used in Hadoop, namely the _Writable_ and _DBWritable_
interfaces. They also contain other convenience methods: a +parse()+ method
that interprets delimited text fields, and a +toString()+ method that preserves
the user's chosen delimiters. The full set of methods guaranteed to exist in an
auto-generated class are specified in the interface

Sqoop automatically generates classes that represent the tables
imported into the Hadoop Distributed File System (HDFS). The class
contains member fields for each column of the imported table; an
instance of the class holds one row of the table. The generated
classes implement the serialization APIs used in Hadoop, namely the
_Writable_ and _DBWritable_ interfaces. They also contain these other
convenience methods:

- A parse() method that interprets delimited text fields
- A toString() method that preserves the user's chosen delimiters

The full set of methods guaranteed to exist in an auto-generated class
is specified in the abstract class
+org.apache.hadoop.sqoop.lib.SqoopRecord+.
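To make the shape of these generated classes concrete, here is a rough,
hypothetical sketch of how one might be used from user code. The +employees+
class below stands in for whatever class Sqoop generates for your table; it is
not part of Sqoop itself, and the exact generated signatures may differ.

----
// Hypothetical usage of a Sqoop-generated class for an "employees" table.
// The "employees" type is produced by code generation, not shipped with Sqoop.
import org.apache.hadoop.io.Text;

public class GeneratedClassExample {
  public static void main(String[] args) throws Exception {
    employees record = new employees();              // generated SqoopRecord subclass
    record.parse(new Text("1,Aaron,engineering"));   // interpret one delimited text row
    System.out.println(record.toString());           // re-emit using the chosen delimiters
  }
}
----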

Instances of _SqoopRecord_ may depend on Sqoop's public API. This is all classes
Instances of +SqoopRecord+ may depend on Sqoop's public API. This is all classes
in the +org.apache.hadoop.sqoop.lib+ package. These are briefly described below.
Clients of Sqoop should not need to directly interact with any of these classes,
although classes generated by Sqoop will depend on them. Therefore, these APIs
@@ -57,16 +66,21 @@ are considered public and care will be taken when forward-evolving them.

* +BigDecimalSerializer+ contains a pair of methods that facilitate
  serialization of +BigDecimal+ objects over the _Writable_ interface.
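As a hedged illustration, a Writable implementation holding a +BigDecimal+
field might delegate to this helper roughly as follows; the static method
names are assumed to mirror the usual +write()+/+readFields()+ convention.

----
// Sketch only: assumes BigDecimalSerializer exposes static write()/readFields()
// helpers as described above.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.math.BigDecimal;
import org.apache.hadoop.sqoop.lib.BigDecimalSerializer;

public class PriceHolder {
  private BigDecimal price = BigDecimal.ZERO;

  public void write(DataOutput out) throws IOException {
    BigDecimalSerializer.write(price, out);      // serialize over the Writable protocol
  }

  public void readFields(DataInput in) throws IOException {
    price = BigDecimalSerializer.readFields(in); // read the value back
  }
}
----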

The full specification of the public API is available on the Sqoop
Development Wiki as http://wiki.github.com/cloudera/sqoop/sip-4[SIP-4].


The Extension API
~~~~~~~~~~~~~~~~~

This section covers the API and primary classes used by extensions for Sqoop
which allow Sqoop to interface with more database vendors.

While Sqoop uses JDBC and +DBInputFormat+ (and +DataDrivenDBInputFormat+) to
While Sqoop uses JDBC and +DataDrivenDBInputFormat+ to
read from databases, differences in the SQL supported by different vendors as
well as JDBC metadata necessitate vendor-specific codepaths for most databases.
Sqoop's solution to this problem is by introducing the ConnManager API
Sqoop's solution to this problem is by introducing the +ConnManager+ API
(+org.apache.hadoop.sqoop.manager.ConnManager+).

+ConnManager+ is an abstract class defining all methods that interact with the
@@ -80,40 +94,46 @@ selectively override behavior. For example, the +getColNamesQuery()+ method

allows the SQL query used by +getColNames()+ to be modified without needing to
rewrite the majority of +getColNames()+.

+ConnManager+ implementations receive a lot of their configuration data from a
Sqoop-specific class, +SqoopOptions+. While +SqoopOptions+ does not currently
contain many setter methods, clients should not assume +SqoopOptions+ are
immutable. More setter methods may be added in the future. +SqoopOptions+ does
not directly store specific per-manager options. Instead, it contains a
reference to the +Configuration+ returned by +Tool.getConf()+ after parsing
command-line arguments with the +GenericOptionsParser+. This allows extension
arguments via "+-D any.specific.param=any.value+" without requiring any layering
of options parsing or modification of +SqoopOptions+.

+ConnManager+ implementations receive a lot of their configuration
data from a Sqoop-specific class, +SqoopOptions+. +SqoopOptions+ are
mutable. +SqoopOptions+ does not directly store specific per-manager
options. Instead, it contains a reference to the +Configuration+
returned by +Tool.getConf()+ after parsing command-line arguments with
the +GenericOptionsParser+. This allows extension arguments via
"+-D any.specific.param=any.value+" without requiring any layering of
options parsing or modification of +SqoopOptions+. This
+Configuration+ forms the basis of the +Configuration+ passed to any
MapReduce +Job+ invoked in the workflow, so that users can set on the
command-line any necessary custom Hadoop state.
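For example, a value passed on the command line as
+-D example.fetch.size=1000+ could be read back out of that +Configuration+ by
extension code along the lines of the sketch below; the property name is
invented for illustration and the +getConf()+ accessor is assumed.

----
// Sketch, not a real Sqoop setting: "example.fetch.size" is a made-up property.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.sqoop.SqoopOptions;

public class CustomParamExample {
  public static int getFetchSize(SqoopOptions options) {
    Configuration conf = options.getConf();          // populated by GenericOptionsParser
    return conf.getInt("example.fetch.size", 500);   // 500 is an arbitrary default
  }
}
----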

All existing +ConnManager+ implementations are stateless. Thus, the system which
instantiates +ConnManagers+ may implement multiple instances of the same
+ConnManager+ class over Sqoop's lifetime. If a caching layer is required, we
can add one later, but it is not currently available.

All existing +ConnManager+ implementations are stateless. Thus, the
system which instantiates +ConnManagers+ may implement multiple
instances of the same +ConnManager+ class over Sqoop's lifetime. It
is currently assumed that instantiating a +ConnManager+ is a
lightweight operation, and is done reasonably infrequently. Therefore,
+ConnManagers+ are not cached between operations, etc.

+ConnManagers+ are currently created by instances of the abstract class
+ManagerFactory+ (see MAPREDUCE-750). One +ManagerFactory+ implementation
currently serves all of Sqoop:
+org.apache.hadoop.sqoop.manager.DefaultManagerFactory+. Extensions
should not modify +DefaultManagerFactory+. Instead, an extension-specific
+ManagerFactory+ implementation should be provided with the new ConnManager.
+ManagerFactory+ has a single method of note, named +accept()+. This method will
determine whether it can instantiate a +ConnManager+ for the user's
+SqoopOptions+. If so, it returns the +ConnManager+ instance. Otherwise, it
returns +null+.

+ConnManagers+ are currently created by instances of the abstract
class +ManagerFactory+ (see
http://issues.apache.org/jira/browse/MAPREDUCE-750[]). One
+ManagerFactory+ implementation currently serves all of Sqoop:
+org.apache.hadoop.sqoop.manager.DefaultManagerFactory+. Extensions
should not modify +DefaultManagerFactory+. Instead, an
extension-specific +ManagerFactory+ implementation should be provided
with the new +ConnManager+. +ManagerFactory+ has a single method of
note, named +accept()+. This method will determine whether it can
instantiate a +ConnManager+ for the user's +SqoopOptions+. If so, it
returns the +ConnManager+ instance. Otherwise, it returns +null+.
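A minimal extension-specific factory might therefore look roughly like the
sketch below. The +jdbc:exampledb:+ scheme and the +ExampleDbManager+ class
are hypothetical; only the +accept()+ contract described above is taken from
this guide.

----
// Hypothetical third-party ManagerFactory; ExampleDbManager would be the
// extension's own ConnManager subclass.
import org.apache.hadoop.sqoop.SqoopOptions;
import org.apache.hadoop.sqoop.manager.ConnManager;
import org.apache.hadoop.sqoop.manager.ManagerFactory;

public class ExampleDbManagerFactory extends ManagerFactory {
  public ConnManager accept(SqoopOptions options) {
    String connectStr = options.getConnectString();
    if (connectStr != null && connectStr.startsWith("jdbc:exampledb:")) {
      return new ExampleDbManager(options);  // this factory can handle the URL
    }
    return null;  // decline; another factory may accept the options instead
  }
}
----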

The +ManagerFactory+ implementations used are governed by the
+sqoop.connection.factories+ setting in sqoop-site.xml. Users of extension
+sqoop.connection.factories+ setting in +sqoop-site.xml+. Users of extension
libraries can install the 3rd-party library containing a new +ManagerFactory+
and +ConnManager+(s), and configure sqoop-site.xml to use the new
and +ConnManager+(s), and configure +sqoop-site.xml+ to use the new
+ManagerFactory+. The +DefaultManagerFactory+ principally discriminates between
databases by parsing the connect string stored in +SqoopOptions+.

Extension authors may make use of classes in the +org.apache.hadoop.sqoop.io+,
+mapred+, +mapreduce+, and +util+ packages to facilitate their implementations.
+mapreduce+, and +util+ packages to facilitate their implementations.
These packages and classes are described in more detail in the following
section.
@@ -134,35 +154,43 @@ General program flow

The general program flow is as follows:

+org.apache.hadoop.sqoop.Sqoop+ is the main class and implements _Tool_. A new
instance is launched with +ToolRunner+. It parses its arguments using the
+SqoopOptions+ class. Within the +SqoopOptions+, an +ImportAction+ will be
chosen by the user. This may be import all tables, import one specific table,
execute a SQL statement, or others.
instance is launched with +ToolRunner+. The first argument to Sqoop is
a string identifying the name of a +SqoopTool+ to run. The +SqoopTool+
itself drives the execution of the user's requested operation (e.g.,
import, export, codegen, etc.).

A +ConnManager+ is then instantiated based on the data in the +SqoopOptions+.
The +ConnFactory+ is used to get a +ConnManager+ from a +ManagerFactory+; the
mechanics of this were described in an earlier section.
The +SqoopTool+ API is specified fully in
http://wiki.github.com/cloudera/sqoop/sip-1[SIP-1].

Then in the +run()+ method, using a case statement, it determines which actions
the user needs performed based on the +ImportAction+ enum. Usually this involves
determining a list of tables to import, generating user code for them, and
running a MapReduce job per table to read the data. The import itself does not
specifically need to be run via a MapReduce job; the +ConnManager.importTable()+
method is left to determine how best to run the import. Each of these actions is
controlled by the +ConnManager+, except for the generating of code, which is
done by the +CompilationManager+ and +ClassWriter+. (Both in the
+org.apache.hadoop.sqoop.orm+ package.) Importing into Hive is also taken care
of via the +org.apache.hadoop.sqoop.hive.HiveImport+ class after the
+importTable()+ has completed. This is done without concern for the
+ConnManager+ implementation used.
The chosen +SqoopTool+ will parse the remainder of the arguments,
setting the appropriate fields in the +SqoopOptions+ class. It will
then run its body.

A ConnManager's +importTable()+ method receives a single argument of type
+ImportJobContext+ which contains parameters to the method. This class may be
extended with additional parameters in the future, which optionally further
direct the import operation. Similarly, the +exportTable()+ method receives an
argument of type +ExportJobContext+. These classes contain the name of the table
to import/export, a reference to the +SqoopOptions+ object, and other related
data.
Then in the SqoopTool's +run()+ method, the import or export or other
action proper is executed. Typically, a +ConnManager+ is then
instantiated based on the data in the +SqoopOptions+. The
+ConnFactory+ is used to get a +ConnManager+ from a +ManagerFactory+;
the mechanics of this were described in an earlier section. Imports
and exports and other large data motion tasks typically run a
MapReduce job to operate on a table in a parallel, reliable fashion.
An import does not specifically need to be run via a MapReduce job;
the +ConnManager.importTable()+ method is left to determine how best
to run the import. Each main action is actually controlled by the
+ConnManager+, except for the generating of code, which is done by
the +CompilationManager+ and +ClassWriter+. (Both in the
+org.apache.hadoop.sqoop.orm+ package.) Importing into Hive is also
taken care of via the +org.apache.hadoop.sqoop.hive.HiveImport+ class
after the +importTable()+ has completed. This is done without concern
for the +ConnManager+ implementation used.

A ConnManager's +importTable()+ method receives a single argument of
type +ImportJobContext+ which contains parameters to the method. This
class may be extended with additional parameters in the future, which
optionally further direct the import operation. Similarly, the
+exportTable()+ method receives an argument of type
+ExportJobContext+. These classes contain the name of the table to
import/export, a reference to the +SqoopOptions+ object, and other
related data.
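The skeleton of a tool therefore looks roughly as follows. The tool name,
class, and method signatures here are approximations reconstructed from the
description above; SIP-1 is the authoritative specification.

----
// Approximate sketch of a custom SqoopTool; not the exact API.
import org.apache.hadoop.sqoop.SqoopOptions;
import org.apache.hadoop.sqoop.tool.SqoopTool;

public class PingTool extends SqoopTool {
  public PingTool() {
    super("ping");  // the name users pass as the first argument to sqoop
  }

  public int run(SqoopOptions options) {
    // Body of the tool: a real tool would obtain a ConnManager via ConnFactory
    // here and drive an import, export, or other action.
    System.out.println("Would connect to: " + options.getConnectString());
    return 0;  // exit status
  }
}
----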

Subpackages
^^^^^^^^^^^

@@ -175,8 +203,9 @@ The following subpackages under +org.apache.hadoop.sqoop+ exist:

* +lib+ - The external public API (described earlier).
* +manager+ - The +ConnManager+ and +ManagerFactory+ interface and their
  implementations.
* +mapreduce+ - Classes interfacing with the new (0.20+) MapReduce API....
* +mapreduce+ - Classes interfacing with the new (0.20+) MapReduce API.
* +orm+ - Code auto-generation.
* +tool+ - Implementations of +SqoopTool+.
* +util+ - Miscellaneous utility classes.

The +io+ package contains _OutputStream_ and _BufferedWriter_ implementations
@@ -185,11 +214,13 @@ BufferedWriter to be opened to a client which will, under the hood, write to

multiple files in series as they reach a target threshold size. This allows
unsplittable compression libraries (e.g., gzip) to be used in conjunction with
Sqoop import while still allowing subsequent MapReduce jobs to use multiple
input splits per dataset.
input splits per dataset. The large object file storage (see
http://wiki.github.com/cloudera/sqoop/sip-3[SIP-3]) system's code
lies in the +io+ package as well.

The +mapreduce+ package contains +DataDrivenImportJob+, which uses the
+DataDrivenDBInputFormat+ introduced in 0.21. Most +ConnManager+
implementations use +DataDrivenImportJob+ to perform their imports.
The +mapreduce+ package contains code that interfaces directly with
Hadoop MapReduce. This package's contents are described in more detail
in the next section.

The +orm+ package contains code used for class generation. It depends on the
JDK's tools.jar which provides the com.sun.tools.javac package.

@@ -237,3 +268,29 @@ and forward the data along to HDFS, possibly performing formatting conversions

in the meantime.


Interfacing with MapReduce
^^^^^^^^^^^^^^^^^^^^^^^^^^

Sqoop schedules MapReduce jobs to effect imports and exports.
Configuration and execution of MapReduce jobs follow a few common
steps (configuring the +InputFormat+; configuring the +OutputFormat+;
setting the +Mapper+ implementation; etc.). These steps are
formalized in the +org.apache.hadoop.sqoop.mapreduce.JobBase+ class.
The +JobBase+ allows a user to specify the +InputFormat+,
+OutputFormat+, and +Mapper+ to use.

+JobBase+ itself is subclassed by +ImportJobBase+ and +ExportJobBase+,
which offer better support for the particular configuration steps
common to import- or export-related jobs, respectively.
+ImportJobBase.runImport()+ will call the configuration steps and run
a job to import a table to HDFS.

Subclasses of these base classes exist as well. For example,
+DataDrivenImportJob+ uses the +DataDrivenDBInputFormat+ to run an
import. This is the most common type of import used by the various
+ConnManager+ implementations available. MySQL uses a different class
(+MySQLDumpImportJob+) to run a direct-mode import. Its custom
+Mapper+ and +InputFormat+ implementations reside in this package as
well.
src/docs/dev/compiling.txt (new file)
@@ -0,0 +1,32 @@
Compiling Sqoop from Source
---------------------------

You can obtain the source code for Sqoop at:
http://github.com/cloudera/sqoop

Sqoop source code is held in a +git+ repository. Instructions for
retrieving source from the repository are provided at:
http://wiki.github.com/cloudera/sqoop/DevelopmentProcess

Compilation instructions are provided in the +COMPILING.txt+ file in
the root of the source repository.
@@ -20,15 +20,9 @@
Introduction
------------

Sqoop is a tool designed to help users of large data import
existing relational databases into their Hadoop clusters. Sqoop uses
JDBC to connect to a database, examine each table's schema, and
auto-generate the necessary classes to import data into HDFS. It
then instantiates a MapReduce job to read tables from the database
via the DBInputFormat (JDBC-based InputFormat). Tables are read
into a set of files loaded into HDFS. Both SequenceFile and
text-based targets are supported. Sqoop also supports high-performance
imports from select databases including MySQL.
If you are a developer or an application programmer who intends to
modify Sqoop or build an extension using one of Sqoop's internal APIs,
you should read this document. The following sections describe the
purpose of each API, where internal APIs are used, and which APIs are
necessary for implementing support for additional databases.

This document describes how to get started using Sqoop to import
your data into Hadoop.
src/docs/dev/preface.txt (new file)
@@ -0,0 +1,55 @@
Supported Releases
------------------

This documentation applies to Sqoop v1.0.0 (June 2010).

Sqoop Releases
--------------

Sqoop is an open source software product of Cloudera, Inc. Software
development for Sqoop occurs at http://github.com/cloudera/sqoop. At
that site, you can obtain:

- New releases of Sqoop as well as its most recent source code
- An issue tracker
- A wiki that contains Sqoop documentation

Prerequisites
-------------

The following prerequisite knowledge is required for Sqoop:

- Software development in Java
  * Familiarity with JDBC
  * Familiarity with Hadoop's APIs (including the "new" MapReduce API of 0.20+)
- Relational database management systems and SQL

This document assumes you are using a Linux or Linux-like environment.
If you are using Windows, you may be able to use cygwin to accomplish
most of the following tasks. If you are using Mac OS X, you should see
few (if any) compatibility errors. Sqoop is predominantly operated and
tested on Linux.
@@ -1,77 +0,0 @@

Direct-mode Imports
-------------------

While the JDBC-based import method used by Sqoop provides it with the
ability to read from a variety of databases using a generic driver, it
is not the most high-performance method available. Sqoop can read from
certain database systems faster by using their built-in export tools.

For example, Sqoop can read from a MySQL database by using the +mysqldump+
tool distributed with MySQL. You can take advantage of this faster
import method by running Sqoop with the +--direct+ argument. This,
combined with a connect string that begins with +jdbc:mysql://+, will
inform Sqoop that it should select the faster access method.

If your delimiters exactly match the delimiters used by +mysqldump+,
then Sqoop will use a fast-path that copies the data directly from
+mysqldump+'s output into HDFS. Otherwise, Sqoop will parse +mysqldump+'s
output into fields and transcode them into the user-specified delimiter set.
This incurs additional processing, so performance may suffer.
For convenience, the +--mysql-delimiters+ argument will set all the output
delimiters to be consistent with +mysqldump+'s format.

Sqoop also provides a direct-mode backend for PostgreSQL that uses the
+COPY TO STDOUT+ protocol from +psql+. No specific delimiter set provides
better performance; Sqoop will forward delimiter control arguments to
+psql+.

The "Supported Databases" section provides a full list of database vendors
which have direct-mode support from Sqoop.

When writing to HDFS, direct mode will open a single output file to receive
the results of the import. You can instruct Sqoop to use multiple output
files by using the +--direct-split-size+ argument, which takes a size in
bytes. Sqoop will generate files of approximately this size. e.g.,
+--direct-split-size 1000000+ will generate files of approximately 1 MB
each. If compressing the HDFS files with +--compress+, this will allow
subsequent MapReduce programs to use multiple mappers across your data
in parallel.

Tool-specific arguments
~~~~~~~~~~~~~~~~~~~~~~~

Sqoop will generate a set of command-line arguments with which it invokes
the underlying direct-mode tool (e.g., mysqldump). You can specify additional
arguments which should be passed to the tool by passing them to Sqoop
after a single '+-+' argument. e.g.:

----
$ sqoop --connect jdbc:mysql://localhost/db --table foo --direct - --lock-tables
----

The +--lock-tables+ argument (and anything else to the right of the +-+ argument)
will be passed directly to mysqldump.
@@ -1,76 +0,0 @@

Exporting to a Database
-----------------------

In addition to importing database tables into HDFS, Sqoop can also
work in "reverse," reading the contents of a file or directory in
HDFS, interpreting the data as database rows, and inserting them
into a specified database table.

To run an export, invoke Sqoop with the +--export-dir+ and
+--table+ options. e.g.:

----
$ sqoop --connect jdbc:mysql://db.example.com/foo --table bar \
    --export-dir /results/bar_data
----

This will take the files in +/results/bar_data+ and inject their
contents into the +bar+ table in the +foo+ database on +db.example.com+.
The target table must already exist in the database. Sqoop will perform
a set of +INSERT INTO+ operations, without regard for existing content. If
Sqoop attempts to insert rows which violate constraints in the database
(e.g., a particular primary key value already exists), then the export
will fail.

As in import mode, Sqoop will auto-generate an interoperability class
to use with the particular table in question. This will be used to parse
the records in HDFS files before loading their contents into the database.
You must specify the same delimiters (e.g., with +--fields-terminated-by+,
etc.) as are used in the files to export in order to parse the data
correctly. If your data is stored in SequenceFiles (created with an import
in the +--as-sequencefile+ format), then you do not need to specify
delimiters.

If you have an existing auto-generated jar and class that you intend to use
with Sqoop, you can specify these with the +--jar-file+ and +--class-name+
parameters. Providing these options will disable autogeneration of a new
class based on the target table.


Exporting to MySQL
~~~~~~~~~~~~~~~~~~

MySQL supports a direct mode for exports. If the +--direct+ argument is given
when exporting to a MySQL database, Sqoop will use instances of +mysqlimport+
to manage the export process.

For performance, each writer will commit approximately every 32 MB of exported
data. This can be controlled by passing the following argument _before_ any
named parameters: +-D sqoop.mysql.export.checkpoint.bytes=_size_+, where _size_
is a value in bytes. Setting _size_ to 0 will disable intermediate checkpoints,
although individual files being exported will continue to be committed
independently of one another.

IMPORTANT: Note that any arguments to Sqoop that are of the form
+-D parameter=value+ must appear before any named arguments (e.g., +--connect+,
+--table+, etc).
@@ -1,92 +0,0 @@

Automatic Full-database Import
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If you want to import all the tables in a database, you can use the
+--all-tables+ command to do so:

----
$ sqoop --connect jdbc:mysql://database.example.com/employees --all-tables
----

This will query the database for the available tables, generate an ORM
class for each table, and run a MapReduce job to import each one.
Hadoop uses the DBInputFormat to read from a database into a Mapper
instance. Reading a table into a MapReduce program requires creating a
class to hold the fields of one row of the table. One of the benefits
of Sqoop is that it generates this class definition for you, based on
the table definition in the database.

The generated +.java+ files are, by default, placed in the current
directory. You can supply a different directory with the +--outdir+
parameter. These are then compiled into +.class+ and +.jar+ files for use
by the MapReduce job that it launches. These files are created in a
temporary directory. You can redirect this target with +--bindir+.

Each table will be imported into a separate directory in HDFS, with
the same name as the table. For instance, if my Hadoop username is
aaron, the above command would have generated the following
directories in HDFS:

----
/user/aaron/employee_names
/user/aaron/payroll_checks
/user/aaron/job_descriptions
/user/aaron/office_supplies
----

You can change the base directory under which the tables are loaded
with the +--warehouse-dir+ parameter. For example:

----
$ sqoop --connect jdbc:mysql://database.example.com/employees --all-tables \
    --warehouse-dir /common/warehouse
----

This would create the following directories instead:

----
/common/warehouse/employee_names
/common/warehouse/payroll_checks
/common/warehouse/job_descriptions
/common/warehouse/office_supplies
----

By default the data will be read into text files in HDFS. Each of the
columns will be represented as comma-delimited text. Each row is
terminated by a newline. See the section on "Controlling the Output
Format" below for information on how to change these delimiters.

If you want to leverage compression and binary file formats, the
+--as-sequencefile+ argument to Sqoop will import the table
to a set of SequenceFiles instead. This stores each field of each
database record in a separate object in a SequenceFile.
This representation is also likely to be higher performance when used
as an input to subsequent MapReduce programs as it does not require
parsing. For completeness, Sqoop provides an +--as-textfile+ option, which is
implied by default. An +--as-textfile+ on the command-line will override
a previous +--as-sequencefile+ argument.

The SequenceFile format will embed the records from the database as
objects using the code generated by Sqoop. It is important that you
retain the +.java+ file for this class, as you will need to be able to
instantiate the same type to read the objects back later, in other
user-defined applications.
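For instance, a standalone program could read such an import back using the
standard SequenceFile reader together with the generated class, along the
lines of the hedged sketch below (the +employee_names+ class and the part-file
path are illustrative).

----
// Sketch: reading records imported with --as-sequencefile. "employee_names"
// is the Sqoop-generated class for that table; the key type and file name
// reflect the usual layout of these imports but should be verified locally.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;

public class ReadImportedRecords {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path part = new Path("/user/aaron/employee_names/part-m-00000");
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, part, conf);
    LongWritable key = new LongWritable();
    employee_names value = new employee_names();  // generated class
    while (reader.next(key, value)) {
      System.out.println(value);                  // toString() uses your delimiters
    }
    reader.close();
  }
}
----
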
@@ -1,72 +0,0 @@

Importing Data Into Hive
------------------------

Sqoop's primary function is to upload your data into files in HDFS. If
you have a Hive metastore associated with your HDFS cluster, Sqoop can
also import the data into Hive by generating and executing a +CREATE
TABLE+ statement to define the data's layout in Hive. Importing data
into Hive is as simple as adding the *+--hive-import+* option to your
Sqoop command line.

By default the data is imported into HDFS, but you can skip this operation
by using the *+--hive-create+* option. Optionally, you can specify the
*+--hive-overwrite+* option to indicate that the existing table in Hive must
be replaced. After your data is imported into HDFS, or if this step is
omitted, Sqoop will generate a Hive script containing a +CREATE TABLE+
operation defining your columns using Hive's types, and a +LOAD DATA INPATH+
statement to move the data files into Hive's warehouse directory if the
*+--hive-create+* option is not added. The script will be executed by calling
the installed copy of hive on the machine where Sqoop is run. If you have
multiple Hive installations, or +hive+ is not in your +$PATH+, use the
*+--hive-home+* option to identify the Hive installation directory.
Sqoop will use +$HIVE_HOME/bin/hive+ from here.

NOTE: This function is incompatible with +--as-sequencefile+.

Hive's text parser does not know how to support escaping or enclosing
characters. Sqoop will print a warning if you use +--escaped-by+,
+--enclosed-by+, or +--optionally-enclosed-by+ since Hive does not know
how to parse these. It will pass the field and record terminators through
to Hive. If you do not set any delimiters and do use +--hive-import+,
the field delimiter will be set to +^A+ and the record delimiter will
be set to +\n+ to be consistent with Hive's defaults.

The table name used in Hive is, by default, the same as that of the
source table. You can control the output table name with the +--hive-table+
option.

If Hive import commands are used in conjunction with the +--generate-only+
option, then a Hive import will not occur. Instead, the DDL commands to
perform the import from HDFS to Hive are written to a file named +_tableName_.q+,
which you can then execute with +hive -f+ after the data is brought into
HDFS.

Hive's Type System
~~~~~~~~~~~~~~~~~~

Hive users will note that there is not a one-to-one mapping between
SQL types and Hive types. In general, SQL types that do not have a
direct mapping (e.g., +DATE+, +TIME+, and +TIMESTAMP+) will be coerced to
+STRING+ in Hive. The +NUMERIC+ and +DECIMAL+ SQL types will be coerced to
+DOUBLE+. In these cases, Sqoop will emit a warning in its log messages
informing you of the loss of precision.
@@ -1,48 +0,0 @@

Miscellaneous Additional Arguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If you want to generate the Java classes to represent tables without
actually performing an import, supply a connect string and
(optionally) credentials as above, as well as +--all-tables+ or
+--table+, but also use the +--generate-only+ argument. This will
generate the classes and cease further operation.

You can override the +$HADOOP_HOME+ environment variable within Sqoop
with the +--hadoop-home+ argument. You can override the +$HIVE_HOME+
environment variable with +--hive-home+.

Data emitted to HDFS is by default uncompressed. You can instruct
Sqoop to use gzip to compress your data by providing either the
+--compress+ or +-z+ argument (both are equivalent).

Small CLOB and BLOB values will be imported as string-based data inline
with the rest of their containing record. Over a size threshold (by
default, 16 MB per object), these values will not be materialized
inline, and will be written to external files in HDFS; the inline records
will contain pointers to these files. The inline materialization limit can
be controlled with the +--inline-lob-limit+ argument; the limit itself is
specified in bytes.

Using +--verbose+ will instruct Sqoop to print more details about its
operation; this is particularly handy if Sqoop appears to be misbehaving.
|
||||
|
||||
////
|
||||
Licensed to Cloudera, Inc. under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
Supported Databases
|
||||
-------------------
|
||||
|
||||
Sqoop uses JDBC to connect to databases. JDBC is a compatibility layer
|
||||
that allows a program to access many different databases through a common
|
||||
API. Slight differences in the SQL language spoken by each database, however,
|
||||
may mean that Sqoop can't use every database out of the box, or that some
|
||||
databases may be used in an inefficient manner.
|
||||
|
||||
When you provide a connect string to Sqoop, it inspects the protocol scheme to
|
||||
determine appropriate vendor-specific logic to use. If Sqoop knows about
|
||||
a given database, it will work automatically. If not, you may need to
|
||||
specify the driver class to load via +--driver+. This will use a generic
|
||||
code path which will use standard SQL to access the database. Sqoop provides
|
||||
some databases with faster, non-JDBC-based access mechanisms. These can be
|
||||
enabled by specfying the +--direct+ parameter.
|
||||
|
||||
Sqoop includes vendor-specific code paths for the following databases:
|
||||
|
||||
[grid="all"]
|
||||
`-----------`--------`--------------------`---------------------
|
||||
Database version +--direct+ support? connect string matches
|
||||
----------------------------------------------------------------
|
||||
HSQLDB 1.8.0+ No +jdbc:hsqldb:*//+
|
||||
MySQL 5.0+ Yes +jdbc:mysql://+
|
||||
Oracle 10.2.0+ No +jdbc:oracle:*//+
|
||||
PostgreSQL 8.3+ Yes (import only) +jdbc:postgresql://+
|
||||
----------------------------------------------------------------
|
||||
|
||||
Sqoop may work with older versions of the databases listed, but we have
|
||||
only tested it with the versions specified above.
|
||||
|
||||
Even if Sqoop supports a database internally, you may still need to
|
||||
install the database vendor's JDBC driver in your +$HADOOP_HOME/lib+
|
||||
path.
|
||||
|
@@ -1,68 +0,0 @@

Importing Individual Tables
~~~~~~~~~~~~~~~~~~~~~~~~~~~

In addition to full-database imports, Sqoop will allow you to import
individual tables. Instead of using +--all-tables+, specify the name of
a particular table with the +--table+ argument:

----
$ sqoop --connect jdbc:mysql://database.example.com/employees \
    --table employee_names
----

You can further specify a subset of the columns in a table by using
the +--columns+ argument. This takes a list of column names, delimited
by commas, with no spaces in between. e.g.:

----
$ sqoop --connect jdbc:mysql://database.example.com/employees \
    --table employee_names --columns employee_id,first_name,last_name,dept_id
----

Sqoop will use a MapReduce job to read sections of the table in
parallel. For the MapReduce tasks to divide the table space, the
results returned by the database must be orderable. Sqoop will
automatically detect the primary key for a table and use that to order
the results. If no primary key is available, or (less likely) you want
to order the results along a different column, you can specify the
column name with +--split-by+.

.Row ordering
IMPORTANT: To guarantee correctness of your input, you must select an
ordering column for which each row has a unique value. If duplicate
values appear in the ordering column, the results of the import are
undefined, and Sqoop will not be able to detect the error.

Finally, you can control which rows of a table are imported via the
+--where+ argument. With this argument, you may specify a clause to be
appended to the SQL statement used to select rows from the table,
e.g.:

----
$ sqoop --connect jdbc:mysql://database.example.com/employees \
    --table employee_names --where "employee_id > 40 AND active = 1"
----

The +--columns+, +--split-by+, and +--where+ arguments are incompatible with
+--all-tables+. If you require special handling for some of the tables,
then you must manually run a separate import job for each table.
src/docs/user/SqoopUserGuide.txt (new file)
@@ -0,0 +1,52 @@
include::intro.txt[]
include::preface.txt[]
include::basics.txt[]
include::tools.txt[]
include::import.txt[]
include::import-all-tables.txt[]
include::export.txt[]
include::codegen.txt[]
include::create-hive-table.txt[]
include::eval.txt[]
include::list-databases.txt[]
include::list-tables.txt[]
include::help.txt[]
include::version.txt[]
include::compatibility.txt[]
include::support.txt[]
src/docs/user/basics.txt (new file)
@@ -0,0 +1,63 @@
Basic Usage
-----------

With Sqoop, you can _import_ data from a relational database system into
HDFS. The input to the import process is a database table. Sqoop
will read the table row-by-row into HDFS. The output of this import
process is a set of files containing a copy of the imported table.
The import process is performed in parallel. For this reason, the
output will be in multiple files. These files may be delimited text
files (for example, with commas or tabs separating each field), or
binary SequenceFiles containing serialized record data.

A by-product of the import process is a generated Java class which
can encapsulate one row of the imported table. This class is used
during the import process by Sqoop itself. The Java source code for
this class is also provided to you, for use in subsequent MapReduce
processing of the data. This class can serialize and deserialize data
to and from the SequenceFile format. It can also parse the
delimited-text form of a record. These abilities allow you to quickly
develop MapReduce applications that use the HDFS-stored records in
your processing pipeline. You are also free to parse the delimited
record data yourself, using any other tools you prefer.
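As a rough example of what this enables, the sketch below uses the generated
class (again written as a hypothetical +employees+ type) inside a Mapper to
parse delimited text records produced by an import.

----
// Hedged sketch: "employees" stands in for a Sqoop-generated class; the rest is
// the standard 0.20+ ("new") Hadoop MapReduce API.
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ImportedRecordMapper
    extends Mapper<LongWritable, Text, Text, LongWritable> {

  private final employees record = new employees();  // generated class

  protected void map(LongWritable key, Text line, Context context)
      throws IOException, InterruptedException {
    try {
      record.parse(line);   // reuse the generated delimited-text parser
    } catch (Exception e) { // parse failures surface as a checked exception
      throw new IOException(e);
    }
    // Emit whatever your job needs; here, one count per parsed record.
    context.write(new Text(record.toString()), new LongWritable(1));
  }
}
----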

After manipulating the imported records (for example, with MapReduce
or Hive) you may have a result data set which you can then _export_
back to the relational database. Sqoop's export process will read
a set of delimited text files from HDFS in parallel, parse them into
records, and insert them as new rows in a target database table, for
consumption by external applications or users.

Sqoop includes some other commands which allow you to inspect the
database you are working with. For example, you can list the available
database schemas (with the +sqoop-list-databases+ tool) and tables
within a schema (with the +sqoop-list-tables+ tool). Sqoop also
includes a primitive SQL execution shell (the +sqoop-eval+ tool).

Most aspects of the import, code generation, and export processes can
be customized. You can control the specific row range or columns imported.
You can specify particular delimiters and escape characters for the
file-based representation of the data, as well as the file format
used. You can also control the class or package names used in
generated code. Subsequent sections of this document explain how to
specify these and other arguments to Sqoop.
33
src/docs/user/codegen-args.txt
Normal file
@ -0,0 +1,33 @@
.Code generation arguments:
[grid="all"]
`------------------------`-----------------------------------------------
Argument                 Description
-------------------------------------------------------------------------
+\--bindir <dir>+        Output directory for compiled objects
+\--class-name <name>+   Sets the generated class name. This overrides\
                         +\--package-name+. When combined with \
                         +\--jar-file+, sets the input class.
+\--jar-file <file>+     Disable code generation; use specified jar
+\--outdir <dir>+        Output directory for generated code
+\--package-name <name>+ Put auto-generated classes in this package
-------------------------------------------------------------------------
83
src/docs/user/codegen.txt
Normal file
@ -0,0 +1,83 @@
+sqoop-codegen+
---------------

Purpose
~~~~~~~

The +codegen+ tool generates Java classes which encapsulate and
interpret imported records. The Java definition of a record is
instantiated as part of the import process, but can also be performed
separately. For example, if Java source is lost, it can be recreated.
New versions of a class can be created which use different delimiters
between fields, and so on.

Syntax
~~~~~~

----
$ sqoop codegen (generic-args) (codegen-args)
$ sqoop-codegen (generic-args) (codegen-args)
----

Although the Hadoop generic arguments must precede any codegen arguments,
the codegen arguments can be entered in any order with respect to one
another.

include::common-args.txt[]

.Code generation arguments:
[grid="all"]
`------------------------`-----------------------------------------------
Argument                 Description
-------------------------------------------------------------------------
+\--bindir <dir>+        Output directory for compiled objects
+\--class-name <name>+   Sets the generated class name. This overrides\
                         +\--package-name+.
+\--outdir <dir>+        Output directory for generated code
+\--package-name <name>+ Put auto-generated classes in this package
+\--table <table-name>+  Name of the table to generate code for.
-------------------------------------------------------------------------

include::output-args.txt[]

include::input-args.txt[]

include::hive-args.txt[]

If Hive arguments are provided to the code generation tool, Sqoop
generates a file containing the HQL statements to create a table and
load data.

Example Invocations
~~~~~~~~~~~~~~~~~~~

Recreate the record interpretation code for the +employees+ table of a
corporate database:

----
$ sqoop codegen --connect jdbc:mysql://db.example.com/corp \
    --table employees
----
33
src/docs/user/common-args.txt
Normal file
@ -0,0 +1,33 @@
.Common arguments
[grid="all"]
`-------------------------`------------------------------------------
Argument                  Description
---------------------------------------------------------------------
+\--connect <jdbc-uri>+   Specify JDBC connect string
+\--driver <class-name>+  Manually specify JDBC driver class to use
+\--hadoop-home <dir>+    Override $HADOOP_HOME
+\--help+                 Print usage instructions
+-P+                      Read password from console
+\--password <password>+  Set authentication password
+\--username <username>+  Set authentication username
+\--verbose+              Print more information while working
---------------------------------------------------------------------
184
src/docs/user/compatibility.txt
Normal file
@ -0,0 +1,184 @@
Compatibility Notes
-------------------

Sqoop uses JDBC to connect to databases and adheres to
published standards as much as possible. For databases which do not
support standards-compliant SQL, Sqoop uses alternate codepaths to
provide functionality. In general, Sqoop is believed to be compatible
with a large number of databases, but it is tested with only a few.

Nonetheless, several database-specific decisions were made in the
implementation of Sqoop, and some databases offer additional settings
which are extensions to the standard.

This section describes the databases tested with Sqoop, any
exceptions in Sqoop's handling of each database relative to the
norm, and any database-specific settings available in Sqoop.

Supported Databases
~~~~~~~~~~~~~~~~~~~

While JDBC is a compatibility layer that allows a program to access
many different databases through a common API, slight differences in
the SQL language spoken by each database may mean that Sqoop can't use
every database out of the box, or that some databases may be used in
an inefficient manner.

When you provide a connect string to Sqoop, it inspects the protocol scheme to
determine appropriate vendor-specific logic to use. If Sqoop knows about
a given database, it will work automatically. If not, you may need to
specify the driver class to load via +\--driver+. This will use a generic
code path which will use standard SQL to access the database. Sqoop provides
some databases with faster, non-JDBC-based access mechanisms. These can be
enabled by specifying the +\--direct+ parameter.

Sqoop includes vendor-specific support for the following databases:

[grid="all"]
`-----------`--------`--------------------`---------------------
Database     version  +\--direct+ support? connect string matches
----------------------------------------------------------------
HSQLDB       1.8.0+   No                   +jdbc:hsqldb:*//+
MySQL        5.0+     Yes                  +jdbc:mysql://+
Oracle       10.2.0+  No                   +jdbc:oracle:*//+
PostgreSQL   8.3+     Yes (import only)    +jdbc:postgresql://+
----------------------------------------------------------------

Sqoop may work with older versions of the databases listed, but we have
only tested it with the versions specified above.

Even if Sqoop supports a database internally, you may still need to
install the database vendor's JDBC driver in your +$HADOOP_HOME/lib+
path.

MySQL
~~~~~

JDBC Driver: http://www.mysql.com/downloads/connector/j/[MySQL
Connector/J]

MySQL v5.0 and above offers very thorough coverage by Sqoop. In builds
of Sqoop included with Cloudera's Distribution for Hadoop, the
Connector/J JDBC driver is included with the installation.

zeroDateTimeBehavior
^^^^^^^^^^^^^^^^^^^^

MySQL allows values of +'0000-00-00'+ for +DATE+ columns, which is a
non-standard extension to SQL. When communicated via JDBC, these
values are handled in one of three different ways:

- Convert to +NULL+.
- Throw an exception in the client.
- Round to the nearest legal date (+'0001-01-01'+).

You specify the behavior by using the +zeroDateTimeBehavior+
property of the connect string. If a +zeroDateTimeBehavior+ property
is not specified, Sqoop uses the +convertToNull+ behavior.

You can override this behavior. For example:

----
$ sqoop import --table foo \
    --connect jdbc:mysql://db.example.com/someDb?zeroDateTimeBehavior=round
----

+UNSIGNED+ columns
^^^^^^^^^^^^^^^^^^

Columns with type +UNSIGNED+ in MySQL can hold values between 0 and
2^32 (+4294967295+), but the database will report the data type to Sqoop
as +INTEGER+, which can hold values between +-2147483648+ and
+\+2147483647+. Sqoop cannot currently import +UNSIGNED+ values above
+2147483647+.

+BLOB+ and +CLOB+ columns
^^^^^^^^^^^^^^^^^^^^^^^^^

Sqoop's direct mode does not support imports of +BLOB+, +CLOB+, or
+LONGVARBINARY+ columns. Use JDBC-based imports for these
columns; do not supply the +\--direct+ argument to the import tool.

Direct-mode Transactions
^^^^^^^^^^^^^^^^^^^^^^^^

For performance, each writer will commit the current transaction
approximately every 32 MB of exported data. You can control this
by specifying the following argument _before_ any tool-specific arguments: +-D
sqoop.mysql.export.checkpoint.bytes=size+, where _size_ is a value in
bytes. Set _size_ to 0 to disable intermediate checkpoints,
but individual files being exported will continue to be committed
independently of one another.

IMPORTANT: Note that any arguments to Sqoop that are of the form +-D
parameter=value+ are Hadoop _generic arguments_ and must appear before
any tool-specific arguments (for example, +\--connect+, +\--table+, etc.).
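For instance, an export that checkpoints roughly every 64 MB might be invoked
along these lines (the connect string, table name, and export directory here
are placeholders):

----
$ sqoop export -D sqoop.mysql.export.checkpoint.bytes=67108864 \
    --connect jdbc:mysql://db.example.com/foo --table bar \
    --export-dir /results/bar_data --direct
----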

Oracle
~~~~~~

JDBC Driver:
http://www.oracle.com/technology/software/tech/java/sqlj_jdbc/htdocs/jdbc_112010.html[Oracle
JDBC Thin Driver] - Sqoop is compatible with +ojdbc6.jar+.

Sqoop has been tested with Oracle 10.2.0 Express Edition. Oracle is
notable in its different approach to SQL from the ANSI standard, and
its non-standard JDBC driver. Therefore, several features work
differently.

Dates and Times
^^^^^^^^^^^^^^^

Oracle JDBC represents +DATE+ and +TIME+ SQL types as +TIMESTAMP+
values. Any +DATE+ columns in an Oracle database will be imported as a
+TIMESTAMP+ in Sqoop, and Sqoop-generated code will store these values
in +java.sql.Timestamp+ fields.

When exporting data back to a database, Sqoop parses text fields as
+TIMESTAMP+ types (with the form +yyyy-mm-dd HH:MM:SS.ffffffff+) even
if you expect these fields to be formatted with the JDBC date escape
format of +yyyy-mm-dd+. Dates exported to Oracle should be formatted
as full timestamps.

Oracle also includes the additional date/time types +TIMESTAMP WITH
TIMEZONE+ and +TIMESTAMP WITH LOCAL TIMEZONE+. To support these types,
the user's session timezone must be specified. By default, Sqoop will
specify the timezone +"GMT"+ to Oracle. You can override this setting
by specifying a Hadoop property +oracle.sessionTimeZone+ on the
command-line when running a Sqoop job. For example:

----
$ sqoop import -D oracle.sessionTimeZone=America/Los_Angeles \
    --connect jdbc:oracle:thin:@//db.example.com/foo --table bar
----

Note that Hadoop parameters (+-D ...+) are _generic arguments_ and
must appear before the tool-specific arguments (+\--connect+,
+\--table+, and so on).

Legal values for the session timezone string are enumerated at
http://download-west.oracle.com/docs/cd/B19306_01/server.102/b14225/applocaledata.htm#i637736[].

include::hive-notes.txt[]
@ -18,68 +18,70 @@

Connecting to a Database Server
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Sqoop is designed to import tables from a database into HDFS. As such,
it requires a _connect string_ that describes how to connect to the
database. The _connect string_ looks like a URL, and is communicated to
Sqoop with the +--connect+ argument. This describes the server and
database to connect to; it may also specify the port. e.g.:
Sqoop is designed to import tables from a database into HDFS. To do
so, you must specify a _connect string_ that describes how to connect to the
database. The _connect string_ is similar to a URL, and is communicated to
Sqoop with the +\--connect+ argument. This describes the server and
database to connect to; it may also specify the port. For example:

----
$ sqoop --connect jdbc:mysql://database.example.com/employees
$ sqoop import --connect jdbc:mysql://database.example.com/employees
----

This string will connect to a MySQL database named +employees+ on the
host +database.example.com+. It's important that you *do not* use the URL
+localhost+ if you intend to use Sqoop with a distributed Hadoop
cluster. The connect string you supply will be used on TaskTracker nodes
throughout your MapReduce cluster; if they're told to connect to the
literal name +localhost+, they'll each reach a different
database (or more likely, no database at all)! Instead, you should use
throughout your MapReduce cluster; if you specify the
literal name +localhost+, each node will connect to a different
database (or more likely, no database at all). Instead, you should use
the full hostname or IP address of the database host that can be seen
by all your remote nodes.

You may need to authenticate against the database before you can
access it. The +--username+ and +--password+ or +-P+ parameters can
be used to supply a username and a password to the database. e.g.:
You might need to authenticate against the database before you can
access it. You can use the +\--username+ and +\--password+ or +-P+ parameters
to supply a username and a password to the database. For example:

----
$ sqoop --connect jdbc:mysql://database.example.com/employees \
$ sqoop import --connect jdbc:mysql://database.example.com/employees \
    --username aaron --password 12345
----

.Password security
WARNING: The +--password+ parameter is insecure, as other users may
WARNING: The +\--password+ parameter is insecure, as other users may
be able to read your password from the command-line arguments via
the output of programs such as `ps`. The *+-P+* argument will read
a password from a console prompt, and is the preferred method of
entering credentials. Credentials may still be transferred between
nodes of the MapReduce cluster using insecure means.

Sqoop automatically supports several databases, including MySQL. Connect strings beginning
with +jdbc:mysql://+ are handled automatically Sqoop, though you may need
to install the driver yourself. (A full list of databases with
built-in support is provided in the "Supported Databases" section, below.)
Sqoop automatically supports several databases, including MySQL.
Connect strings beginning with +jdbc:mysql://+ are handled
automatically in Sqoop, though you may need to install the driver
yourself. (A full list of databases with built-in support is provided
in the "Supported Databases" section.)

You can use Sqoop with any other
JDBC-compliant database as well. First, download the appropriate JDBC
driver for the database you want to import from, and install the .jar
JDBC-compliant database. First, download the appropriate JDBC
driver for the type of database you want to import, and install the .jar
file in the +/usr/hadoop/lib+ directory on all machines in your Hadoop
cluster, or some other directory which is in the classpath
on all nodes. Each driver jar also has a specific driver class which defines
on all nodes. Each driver +.jar+ file also has a specific driver class which defines
the entry-point to the driver. For example, MySQL's Connector/J library has
a driver class of +com.mysql.jdbc.Driver+. Refer to your database
vendor-specific documentation to determine the main driver class.
This class must be provided as an argument to Sqoop with +--driver+.
This class must be provided as an argument to Sqoop with +\--driver+.

For example, to connect to a postgres database, first download the driver from
link:http://jdbc.postgresql.org[http://jdbc.postgresql.org] and
install it in your Hadoop lib path.
Then run Sqoop with something like:
For example, to connect to a SQLServer database, first download the driver from
microsoft.com and install it in your Hadoop lib path.

Then run Sqoop. For example:

----
$ sqoop --connect jdbc:postgresql://postgres-server.example.com/employees \
    --driver org.postgresql.Driver
$ sqoop import --driver com.microsoft.jdbc.sqlserver.SQLServerDriver \
    --connect <connect-string> ...
----
@ -29,14 +29,14 @@ include::input-formatting-args.txt[]

If you have already imported data into HDFS in a text-based
representation and want to change the delimiters being used, you
should regenerate the class via `sqoop --generate-only`, specifying
the new delimiters with +--fields-terminated-by+, etc., and the old
delimiters with +--input-fields-terminated-by+, etc. Then run a
should regenerate the class via `sqoop \--generate-only`, specifying
the new delimiters with +\--fields-terminated-by+, etc., and the old
delimiters with +\--input-fields-terminated-by+, etc. Then run a
MapReduce job where your mapper creates an instance of your record
class, uses its +parse()+ method to read the fields using the old
delimiters, and emits a new +Text+ output value via the record's
+toString()+ method, which will use the new delimiters. You'll then
want to regenerate the class another time without the
+--input-fields-terminated-by+ specified so that the new delimiters
+\--input-fields-terminated-by+ specified so that the new delimiters
are used for both input and output.
82
src/docs/user/create-hive-table.txt
Normal file
@ -0,0 +1,82 @@
+sqoop-create-hive-table+
-------------------------

Purpose
~~~~~~~

The +create-hive-table+ tool populates a Hive metastore with a
definition for a table based on a database table previously imported
to HDFS, or one planned to be imported. This effectively performs the
"+\--hive-import+" step of +sqoop-import+ without running the
preceding import.

If data was already loaded to HDFS, you can use this tool to finish
the pipeline of importing the data to Hive. You can also create Hive tables
with this tool; data can then be imported and populated into
the target after a preprocessing step run by the user.

Syntax
~~~~~~

----
$ sqoop create-hive-table (generic-args) (create-hive-table-args)
$ sqoop-create-hive-table (generic-args) (create-hive-table-args)
----

Although the Hadoop generic arguments must precede any create-hive-table
arguments, the create-hive-table arguments can be entered in any order
with respect to one another.

include::common-args.txt[]

.Hive arguments:
[grid="all"]
`-----------------------------`-------------------------------------------
Argument                      Description
--------------------------------------------------------------------------
+\--hive-home <dir>+          Override +$HIVE_HOME+
+\--hive-overwrite+           Overwrite existing data in the Hive table.
+\--hive-table <table-name>+  Sets the table name to use when importing \
                              to Hive.
+\--table+                    The database table to read the \
                              definition from.
--------------------------------------------------------------------------

include::output-args.txt[]

Do not use enclosed-by or escaped-by delimiters with output formatting
arguments used to import to Hive. Hive cannot currently parse them.

Example Invocations
~~~~~~~~~~~~~~~~~~~

Define in Hive a table named +emps+ with a definition based on a
database table named +employees+:

----
$ sqoop create-hive-table --connect jdbc:mysql://db.example.com/corp \
    --table employees --hive-table emps
----
65
src/docs/user/eval.txt
Normal file
@ -0,0 +1,65 @@
+sqoop-eval+
------------

Purpose
~~~~~~~

The +eval+ tool allows users to quickly run simple SQL queries against
a database; results are printed to the console. This allows users to
preview their import queries to ensure they import the data they
expect.

Syntax
~~~~~~

----
$ sqoop eval (generic-args) (eval-args)
$ sqoop-eval (generic-args) (eval-args)
----

Although the Hadoop generic arguments must precede any eval arguments,
the eval arguments can be entered in any order with respect to one
another.

include::common-args.txt[]

.SQL evaluation arguments:
[grid="all"]
`-----------------------------`-------------------------------------------
Argument                      Description
--------------------------------------------------------------------------
+-e,\--query <statement>+     Execute '+statement+' in SQL.
--------------------------------------------------------------------------

Example Invocations
~~~~~~~~~~~~~~~~~~~

Select ten records from the +employees+ table:

----
$ sqoop eval --connect jdbc:mysql://db.example.com/corp \
    --query "SELECT * FROM employees LIMIT 10"
----
153
src/docs/user/export.txt
Normal file
@ -0,0 +1,153 @@
+sqoop-export+
--------------

Purpose
~~~~~~~

The +export+ tool exports a set of files from HDFS back to an RDBMS.
The target table must already exist in the database. The input files
are read and parsed into a set of records according to the
user-specified delimiters. These are then transformed into a set of
+INSERT+ statements that inject the records into the database.

Syntax
~~~~~~

----
$ sqoop export (generic-args) (export-args)
$ sqoop-export (generic-args) (export-args)
----

Although the Hadoop generic arguments must precede any export arguments,
the export arguments can be entered in any order with respect to one
another.

include::common-args.txt[]

.Export control arguments:
[grid="all"]
`-------------------------`------------------------------------------
Argument                  Description
---------------------------------------------------------------------
+\--direct+               Use direct export fast path
+\--export-dir <dir>+     HDFS source path for the export
+-m,\--num-mappers <n>+   Use 'n' map tasks to export in parallel
+\--table <table-name>+   Table to populate
---------------------------------------------------------------------

The +\--table+ and +\--export-dir+ arguments are required. These
specify the table to populate in the database, and the
directory in HDFS that contains the source data.

You can control the number of mappers independently from the number of
files present in the directory. Export performance depends on the
degree of parallelism. By default, Sqoop will use four tasks in
parallel for the export process. This may not be optimal; you will
need to experiment with your own particular setup. Additional tasks
may offer better concurrency, but if the database is already
bottlenecked on updating indices, invoking triggers, and so on, then
additional load may decrease performance. The +\--num-mappers+ or +-m+
arguments control the number of map tasks, which is the degree of
parallelism used.

MySQL provides a direct mode for exports as well, using the
+mysqlimport+ tool. When exporting to MySQL, use the +\--direct+ argument
to specify this codepath. This may be
higher-performance than the standard JDBC codepath.

include::input-args.txt[]

include::output-args.txt[]

Sqoop automatically generates code to parse and interpret records of the
files containing the data to be exported back to the database. If
these files were created with non-default delimiters (the defaults being
comma-separated fields with newline-separated records), you should specify
the same delimiters again so that Sqoop can parse your files.

If you specify incorrect delimiters, Sqoop will fail to find enough
columns per line. This will cause export map tasks to fail by throwing
+ParseExceptions+.
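
For example, if the files in HDFS were written with tab-separated fields, an
export along these lines (the table and directory names are placeholders)
tells Sqoop how to parse them:

----
$ sqoop export --connect jdbc:mysql://db.example.com/foo --table bar \
    --export-dir /results/bar_data \
    --input-fields-terminated-by '\t'
----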

include::codegen-args.txt[]

If the records to be exported were generated as the result of a
previous import, then the original generated class can be used to read
the data back. Specifying +\--jar-file+ and +\--class-name+ obviates
the need to specify delimiters in this case.
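
A sketch of such an invocation, assuming a previous import produced a jar
named +bar.jar+ containing a generated class +bar+ (both names are
illustrative):

----
$ sqoop export --connect jdbc:mysql://db.example.com/foo --table bar \
    --export-dir /results/bar_data \
    --jar-file bar.jar --class-name bar
----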

Exports and Transactions
~~~~~~~~~~~~~~~~~~~~~~~~

Exports are performed by multiple writers in parallel. Each writer
uses a separate connection to the database; these have separate
transactions from one another. Sqoop uses the multi-row +INSERT+
syntax to insert up to 100 records per statement. Every 100
statements, the current transaction within a writer task is committed,
causing a commit every 10,000 rows. This ensures that transaction
buffers do not grow without bound and cause out-of-memory conditions.
Therefore, an export is not an atomic process. Partial results from
the export will become visible before the export is complete.

Failed Exports
~~~~~~~~~~~~~~

Exports may fail for a number of reasons:

- Loss of connectivity from the Hadoop cluster to the database (either
  due to hardware fault, or server software crashes)
- Attempting to +INSERT+ a row which violates a consistency constraint
  (for example, inserting a duplicate primary key value)
- Attempting to parse an incomplete or malformed record from the HDFS
  source data
- Attempting to parse records using incorrect delimiters
- Capacity issues (such as insufficient RAM or disk space)

If an export map task fails due to these or other reasons, it will
cause the export job to fail. The results of a failed export are
undefined. Each export map task operates in a separate transaction.
Furthermore, individual map tasks +commit+ their current transaction
periodically. If a task fails, the current transaction will be rolled
back. Any previously-committed transactions will remain durable in the
database, leading to a partially-complete export.

Example Invocations
~~~~~~~~~~~~~~~~~~~

A basic export to populate a table named +bar+:

----
$ sqoop export --connect jdbc:mysql://db.example.com/foo --table bar \
    --export-dir /results/bar_data
----

This example takes the files in +/results/bar_data+ and injects their
contents into the +bar+ table in the +foo+ database on +db.example.com+.
The target table must already exist in the database. Sqoop performs
a set of +INSERT INTO+ operations, without regard for existing content. If
Sqoop attempts to insert rows which violate constraints in the database
(for example, a particular primary key value already exists), then the export
fails.
82
src/docs/user/help.txt
Normal file
@ -0,0 +1,82 @@
+sqoop-help+
------------

Purpose
~~~~~~~

List tools available in Sqoop and explain their usage.

Syntax
~~~~~~

----
$ sqoop help [tool-name]
$ sqoop-help [tool-name]
----

If no tool name is provided (for example, the user runs +sqoop help+), then
the available tools are listed. With a tool name, the usage
instructions for that specific tool are presented on the console.

Example Invocations
~~~~~~~~~~~~~~~~~~~

List available tools:

----
$ sqoop help
usage: sqoop COMMAND [ARGS]

Available commands:
  codegen            Generate code to interact with database records
  create-hive-table  Import a table definition into Hive
  eval               Evaluate a SQL statement and display the results
  export             Export an HDFS directory to a database table

...

See 'sqoop help COMMAND' for information on a specific command.
----

Display usage instructions for the +import+ tool:

----
$ bin/sqoop help import
usage: sqoop import [GENERIC-ARGS] [TOOL-ARGS]

Common arguments:
   --connect <jdbc-uri>     Specify JDBC connect string
   --driver <class-name>    Manually specify JDBC driver class to use
   --hadoop-home <dir>      Override $HADOOP_HOME
   --help                   Print usage instructions
   -P                       Read password from console
   --password <password>    Set authentication password
   --username <username>    Set authentication username
   --verbose                Print more information while working

Import control arguments:
   --as-sequencefile        Imports data to SequenceFiles
   --as-textfile            Imports data as plain text (default)
...
----
32
src/docs/user/hive-args.txt
Normal file
@ -0,0 +1,32 @@
.Hive arguments:
[grid="all"]
`-----------------------------`-------------------------------------------
Argument                      Description
--------------------------------------------------------------------------
+\--hive-home <dir>+          Override +$HIVE_HOME+
+\--hive-import+              Import tables into Hive (Uses Hive's \
                              default delimiters if none are set.)
+\--hive-overwrite+           Overwrite existing data in the Hive table.
+\--hive-table <table-name>+  Sets the table name to use when importing\
                              to Hive.
--------------------------------------------------------------------------
30
src/docs/user/hive-notes.txt
Normal file
@ -0,0 +1,30 @@
Schema Definition in Hive
~~~~~~~~~~~~~~~~~~~~~~~~~

Hive users will note that there is not a one-to-one mapping between
SQL types and Hive types. In general, SQL types that do not have a
direct mapping (for example, +DATE+, +TIME+, and +TIMESTAMP+) will be coerced to
+STRING+ in Hive. The +NUMERIC+ and +DECIMAL+ SQL types will be coerced to
+DOUBLE+. In these cases, Sqoop will emit a warning in its log messages
informing you of the loss of precision.
59
src/docs/user/hive.txt
Normal file
@ -0,0 +1,59 @@
Importing Data Into Hive
^^^^^^^^^^^^^^^^^^^^^^^^

Sqoop's import tool's main function is to upload your data into files
in HDFS. If you have a Hive metastore associated with your HDFS
cluster, Sqoop can also import the data into Hive by generating and
executing a +CREATE TABLE+ statement to define the data's layout in
Hive. Importing data into Hive is as simple as adding the
*+\--hive-import+* option to your Sqoop command line.

If the Hive table already exists, you can specify the
*+\--hive-overwrite+* option to indicate that the existing table in Hive must
be replaced. After your data is imported into HDFS or this step is
omitted, Sqoop will generate a Hive script containing a +CREATE TABLE+
operation defining your columns using Hive's types, and a +LOAD DATA INPATH+
statement to move the data files into Hive's warehouse directory.

The script will be executed by calling
the installed copy of Hive on the machine where Sqoop is run. If you have
multiple Hive installations, or +hive+ is not in your +$PATH+, use the
*+\--hive-home+* option to identify the Hive installation directory.
Sqoop will use +$HIVE_HOME/bin/hive+ from here.

NOTE: This function is incompatible with +\--as-sequencefile+.

Hive's text parser does not support escaping or enclosing
characters. Sqoop will print a warning if you use +\--escaped-by+,
+\--enclosed-by+, or +\--optionally-enclosed-by+ because Hive does not know
how to parse these. It will pass the field and record delimiters through
to Hive. If you do not set any delimiters and do use +\--hive-import+,
the field delimiter will be set to +^A+ and the record delimiter will
be set to +\n+ to be consistent with Hive's defaults. It is important when
importing data to Hive to choose unambiguous field and record delimiters
due to the lack of escape and enclosing characters.

The table name used in Hive is, by default, the same as that of the
source table. You can control the output table name with the +\--hive-table+
option.
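
For example, an import that loads a database table directly into a Hive table
with a different name might look like the following (the connect string and
table names are illustrative):

----
$ sqoop import --connect jdbc:mysql://db.example.com/corp \
    --table employees --hive-import --hive-table emps
----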
112
src/docs/user/import-all-tables.txt
Normal file
@ -0,0 +1,112 @@
+sqoop-import-all-tables+
-------------------------

Purpose
~~~~~~~

The +import-all-tables+ tool imports a set of tables from an RDBMS to HDFS.
Data from each table is stored in a separate directory in HDFS.

For the +import-all-tables+ tool to be useful, the following conditions
must be met:

- Each table must have a single-column primary key.
- You must intend to import all columns of each table.
- You must not intend to use a non-default splitting column, nor impose
  any conditions via a +WHERE+ clause.

Syntax
~~~~~~

----
$ sqoop import-all-tables (generic-args) (import-args)
$ sqoop-import-all-tables (generic-args) (import-args)
----

Although the Hadoop generic arguments must precede any import arguments,
the import arguments can be entered in any order with respect to one
another.

include::common-args.txt[]

.Import control arguments:
[grid="all"]
`----------------------------`---------------------------------------
Argument                     Description
---------------------------------------------------------------------
+\--as-sequencefile+         Imports data to SequenceFiles
+\--as-textfile+             Imports data as plain text (default)
+\--direct+                  Use direct import fast path
+\--direct-split-size <n>+   Split the input stream every 'n' bytes when\
                             importing in direct mode
+\--inline-lob-limit <n>+    Set the maximum size for an inline LOB
+-m,\--num-mappers <n>+      Use 'n' map tasks to import in parallel
+\--warehouse-dir <dir>+     HDFS parent for table destination
+-z,\--compress+             Enable compression
---------------------------------------------------------------------

These arguments behave in the same manner as they do when used for the
+sqoop-import+ tool, but the +\--table+, +\--split-by+, +\--columns+,
and +\--where+ arguments are invalid for +sqoop-import-all-tables+.

include::output-args.txt[]

include::input-args.txt[]

include::hive-args.txt[]

.Code generation arguments:
[grid="all"]
`------------------------`-----------------------------------------------
Argument                 Description
-------------------------------------------------------------------------
+\--bindir <dir>+        Output directory for compiled objects
+\--jar-file <file>+     Disable code generation; use specified jar
+\--outdir <dir>+        Output directory for generated code
+\--package-name <name>+ Put auto-generated classes in this package
-------------------------------------------------------------------------

The +import-all-tables+ tool does not support the +\--class-name+ argument.
You may, however, specify a package with +\--package-name+ in which all
generated classes will be placed.

Example Invocations
~~~~~~~~~~~~~~~~~~~

Import all tables from the +corp+ database:

----
$ sqoop import-all-tables --connect jdbc:mysql://db.foo.com/corp
----

Verifying that it worked:

----
$ hadoop fs -ls
Found 4 items
drwxr-xr-x   - someuser somegrp       0 2010-04-27 17:15 /user/someuser/EMPLOYEES
drwxr-xr-x   - someuser somegrp       0 2010-04-27 17:15 /user/someuser/PAYCHECKS
drwxr-xr-x   - someuser somegrp       0 2010-04-27 17:15 /user/someuser/DEPARTMENTS
drwxr-xr-x   - someuser somegrp       0 2010-04-27 17:15 /user/someuser/OFFICE_SUPPLIES
----
500
src/docs/user/import.txt
Normal file
@ -0,0 +1,500 @@
+sqoop-import+
--------------

Purpose
~~~~~~~

The +import+ tool imports an individual table from an RDBMS to HDFS.
Each row from a table is represented as a separate record in HDFS.
Records can be stored as text files (one record per line), or in
binary representation in SequenceFiles.

Syntax
~~~~~~

----
$ sqoop import (generic-args) (import-args)
$ sqoop-import (generic-args) (import-args)
----

While the Hadoop generic arguments must precede any import arguments,
you can type the import arguments in any order with respect to one
another.

NOTE: In this document, arguments are grouped into collections
organized by function. Some collections are present in several tools
(for example, the "common" arguments). An extended description of their
functionality is given only on the first presentation in this
document.

include::common-args.txt[]

include::connecting.txt[]

.Import control arguments:
[grid="all"]
`-----------------------------`--------------------------------------
Argument                      Description
---------------------------------------------------------------------
+\--as-sequencefile+          Imports data to SequenceFiles
+\--as-textfile+              Imports data as plain text (default)
+\--columns <col,col,col...>+ Columns to import from table
+\--direct+                   Use direct import fast path
+\--direct-split-size <n>+    Split the input stream every 'n' bytes\
                              when importing in direct mode
+\--inline-lob-limit <n>+     Set the maximum size for an inline LOB
+-m,\--num-mappers <n>+       Use 'n' map tasks to import in parallel
+\--split-by <column-name>+   Column of the table used to split work\
                              units
+\--table <table-name>+       Table to read
+\--warehouse-dir <dir>+      HDFS parent for table destination
+\--where <where clause>+     WHERE clause to use during import
+-z,\--compress+              Enable compression
---------------------------------------------------------------------

Selecting the Data to Import
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Sqoop currently imports data in a table-centric fashion. Use the
+\--table+ argument to select the table to import. For example, +\--table
employees+. This argument can also identify a +VIEW+ or other table-like
entity in a database.

By default, all columns within a table are selected for import.
Imported data is written to HDFS in its "natural order;" that is, a
table containing columns A, B, and C results in an import of data such
as:

----
A1,B1,C1
A2,B2,C2
...
----

You can select a subset of columns and control their ordering by using
the +\--columns+ argument. This should include a comma-delimited list
of columns to import. For example: +\--columns "name,employee_id,jobtitle"+.

You can control which rows are imported by adding a SQL +WHERE+ clause
to the import statement. By default, Sqoop generates statements of the
form +SELECT <column list> FROM <table name>+. You can append a
+WHERE+ clause to this with the +\--where+ argument. For example: +\--where
"id > 400"+. Only rows where the +id+ column has a value greater than
400 will be imported.
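
Putting these together, an import restricted to a subset of columns and rows
(reusing the sample column list and +WHERE+ clause above; the connect string
is a placeholder) could look like:

----
$ sqoop import --connect jdbc:mysql://db.example.com/corp \
    --table employees --columns "name,employee_id,jobtitle" \
    --where "id > 400"
----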

Controlling Parallelism
^^^^^^^^^^^^^^^^^^^^^^^

Sqoop imports data in parallel from most database sources. You can
specify the number of map tasks (parallel processes) to use to perform
the import by using the +-m+ or +\--num-mappers+ argument. Each of these
arguments takes an integer value which corresponds to the degree of
parallelism to employ. By default, four tasks are used. Some databases
may see improved performance by increasing this value to 8 or 16. Do not
increase the degree of parallelism greater than that available within
your MapReduce cluster; tasks will run serially and will likely
increase the amount of time required to perform the import. Likewise,
do not increase the degree of parallelism higher than that which your
database can reasonably support. Connecting 100 concurrent clients to
your database may increase the load on the database server to a point
where performance suffers as a result.

When performing parallel imports, Sqoop needs a criterion by which it
can split the workload. Sqoop uses a _splitting column_ to split the
workload. By default, Sqoop will identify the primary key column (if
present) in a table and use it as the splitting column. The low and
high values for the splitting column are retrieved from the database,
and the map tasks operate on evenly-sized components of the total
range. For example, if you had a table with a primary key column of
+id+ whose minimum value was 0 and maximum value was 1000, and Sqoop
was directed to use 4 tasks, Sqoop would run four processes which each
execute SQL statements of the form +SELECT * FROM sometable WHERE id
>= lo AND id < hi+, with +(lo, hi)+ set to (0, 250), (250, 500),
(500, 750), and (750, 1001) in the different tasks.

If the actual values for the primary key are not uniformly distributed
across its range, then this can result in unbalanced tasks. You should
explicitly choose a different column with the +\--split-by+ argument.
For example, +\--split-by employee_id+. Sqoop cannot currently split on
multi-column indices. If your table has no index column, or has a
multi-column key, then you must also manually choose a splitting
column.
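
For example, an import that splits on +employee_id+ and uses eight map tasks
might be invoked as follows (the connect string and table name are
illustrative):

----
$ sqoop import --connect jdbc:mysql://db.example.com/corp \
    --table employees --split-by employee_id -m 8
----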
|
||||
|
||||
Controlling the Import Process
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
By default, the import process will use JDBC which provides a
|
||||
reasonable cross-vendor import channel. Some databases can perform
|
||||
imports in a more high-performance fashion by using database-specific
|
||||
data movement tools. For example, MySQL provides the +mysqldump+ tool
|
||||
which can export data from MySQL to other systems very quickly. By
|
||||
supplying the +\--direct+ argument, you are specifying that Sqoop
|
||||
should attempt the direct import channel. This channel may be
|
||||
higher performance than using JDBC. Currently, direct mode does not
|
||||
support imports of large object columns.
|
||||
|
||||
When importing from PostgreSQL in conjunction with direct mode, you
|
||||
can split the import into separate files after
|
||||
individual files reach a certain size. This size limit is controlled
|
||||
with the +\--direct-split-size+ argument.

By default, Sqoop will import a table named +foo+ to a directory named
+foo+ inside your home directory in HDFS. For example, if your
username is +someuser+, then the import tool will write to
+/user/someuser/foo/(files)+. You can adjust the parent directory of
the import with the +\--warehouse-dir+ argument. For example:

----
$ sqoop import --connect <connect-str> --table foo --warehouse-dir /shared \
    ...
----

This command would write to a set of files in the +/shared/foo/+ directory.

When using direct mode, you can specify additional arguments which
should be passed to the underlying tool. If the argument
+\--+ is given on the command-line, then subsequent arguments are sent
directly to the underlying tool. For example, the following adjusts
the character set used by +mysqldump+:

----
$ sqoop import --connect jdbc:mysql://server.foo.com/db --table bar \
    --direct -- --default-character-set=latin1
----

File Formats
^^^^^^^^^^^^

You can import data in one of two file formats: delimited text or
SequenceFiles.

Delimited text is the default import format. You can also specify it
explicitly by using the +\--as-textfile+ argument. This argument will write
string-based representations of each record to the output files, with
delimiter characters between individual columns and rows. These
delimiters may be commas, tabs, or other characters. (The delimiters
can be selected; see "Output line formatting arguments.") The
following is the result of an example text-based import:

----
1,here is a message,2010-05-01
2,happy new year!,2010-01-01
3,another message,2009-11-12
----

Delimited text is appropriate for most non-binary data types. It also
readily supports further manipulation by other tools, such as Hive.

SequenceFiles are a binary format that stores individual records in
custom record-specific data types. These data types are manifested as
Java classes. Sqoop will automatically generate these data types for
you. This format supports exact storage of all data in binary
representations, and is appropriate for storing binary data
(for example, +VARBINARY+ columns), or data that will be principally
manipulated by custom MapReduce programs (reading from SequenceFiles
is higher-performance than reading from text files, as records do not
need to be parsed).

By default, data is not compressed. You can compress
your data by using the deflate (gzip) algorithm with the +-z+ or
+\--compress+ argument. This applies to both SequenceFile and text
files.
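
As a sketch, a compressed import into SequenceFiles might combine the
two arguments like this (the connect string and table name are
illustrative):

----
$ sqoop import --connect jdbc:mysql://db.example.com/corp --table EMPLOYEES \
    --as-sequencefile --compress
----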

Large Objects
^^^^^^^^^^^^^

Sqoop handles large objects (+BLOB+ and +CLOB+ columns) in particular
ways. If this data is truly large, then these columns should not be
fully materialized in memory for manipulation, as most columns are.
Instead, their data is handled in a streaming fashion. Large objects
can be stored inline with the rest of the data, in which case they are
fully materialized in memory on every access, or they can be stored in
a secondary storage file linked to the primary data storage. By
default, large objects less than 16 MB in size are stored inline with
the rest of the data. At a larger size, they are stored in files in
the +_lobs+ subdirectory of the import target directory. These files
are stored in a separate format optimized for large record storage,
which can accommodate records of up to 2^63 bytes each. The size at
which lobs spill into separate files is controlled by the
+\--inline-lob-limit+ argument, which takes a parameter specifying the
largest lob size to keep inline, in bytes. If you set the inline LOB
limit to 0, all large objects will be placed in external
storage.
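
For example, the following sketch keeps LOBs of up to 1 MB (1048576
bytes) inline and spills anything larger into the +_lobs+ directory
(the connect string and table name are illustrative):

----
$ sqoop import --connect jdbc:mysql://db.example.com/corp --table MESSAGES \
    --inline-lob-limit 1048576
----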

include::output-args.txt[]

When importing to delimited files, the choice of delimiter is
important. Delimiters which appear inside string-based fields may
cause ambiguous parsing of the imported data by subsequent analysis
passes. For example, the string +"Hello, pleased to meet you"+ should
not be imported with the end-of-field delimiter set to a comma.

Delimiters may be specified as:

- a character (+\--fields-terminated-by X+)
- an escape character (+\--fields-terminated-by \t+). Supported escape
  characters are:
  * +\b+ (backspace)
  * +\n+ (newline)
  * +\r+ (carriage return)
  * +\t+ (tab)
  * +\"+ (double-quote)
  * +\\'+ (single-quote)
  * +\\+ (backslash)
  * +\0+ (NUL) - This will insert NUL characters between fields or lines,
    or will disable enclosing/escaping if used for one of the +\--enclosed-by+,
    +\--optionally-enclosed-by+, or +\--escaped-by+ arguments.
- The octal representation of a UTF-8 character's code point. This
  should be of the form +\0ooo+, where _ooo_ is the octal value.
  For example, +\--fields-terminated-by \001+ would yield the +^A+ character.
- The hexadecimal representation of a UTF-8 character's code point. This
  should be of the form +\0xhhh+, where _hhh_ is the hex value.
  For example, +\--fields-terminated-by \0x0d+ would yield the carriage
  return character.

The default delimiters are a comma (+,+) for fields, a newline (+\n+)
for records, no quote character, and no escape character. Note that
this can lead to ambiguous/unparsable records if you import database
records containing commas or newlines in the field data. For
unambiguous parsing, both an enclosing and an escaping character must
be enabled; for example, via +\--mysql-delimiters+.

If unambiguous delimiters cannot be presented, then use _enclosing_ and
_escaping_ characters. The combination of (optional)
enclosing and escaping characters will allow unambiguous parsing of
lines. For example, suppose one column of a dataset contained the
following values:

----
Some string, with a comma.
Another "string with quotes"
----

The following arguments would provide delimiters which can be
unambiguously parsed:

----
$ sqoop import --fields-terminated-by , --escaped-by \\ --enclosed-by '\"' ...
----

(Note that to prevent the shell from mangling the enclosing character,
we have enclosed that argument itself in single-quotes.)

The result of the above arguments applied to the above dataset would
be:

----
"Some string, with a comma.","1","2","3"...
"Another \"string with quotes\"","4","5","6"...
----

Here the imported strings are shown in the context of additional
columns (+"1","2","3"+, etc.) to demonstrate the full effect of enclosing
and escaping. The enclosing character is only strictly necessary when
delimiter characters appear in the imported text. The enclosing
character can therefore be specified as optional:

----
$ sqoop import --optionally-enclosed-by '\"' (the rest as above)...
----

This would result in the following import:

----
"Some string, with a comma.",1,2,3...
"Another \"string with quotes\"",4,5,6...
----

NOTE: Hive does not support enclosing and escaping characters. You
must choose unambiguous field and record-terminating delimiters
without the help of escaping and enclosing characters when
working with Hive; this is a limitation of Hive's input parsing
abilities.

The +\--mysql-delimiters+ argument is a shorthand argument which uses
the default delimiters for the +mysqldump+ program.
If you use the +mysqldump+ delimiters in conjunction with a
direct-mode import (with +\--direct+), very fast imports can be
achieved.

While the choice of delimiters is most important for a text-mode
import, it is still relevant if you import to SequenceFiles with
+\--as-sequencefile+. The generated class' +toString()+ method
will use the delimiters you specify, so subsequent formatting of
the output data will rely on the delimiters you choose.

include::input-args.txt[]

When Sqoop imports data to HDFS, it generates a Java class which can
reinterpret the text files that it creates when doing a
delimited-format import. The delimiters are chosen with arguments such
as +\--fields-terminated-by+; this controls both how the data is
written to disk, and how the generated +parse()+ method reinterprets
this data. The delimiters used by the +parse()+ method can be chosen
independently of the output arguments, by using
+\--input-fields-terminated-by+, and so on. This is useful, for example, to
generate classes which can parse records created with one set of
delimiters, and emit the records to a different set of files using a
separate set of delimiters.
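
As a sketch, the following hypothetical invocation generates a class
whose +parse()+ method expects comma-delimited input while the records
it writes are tab-delimited (the connect string and table name are
illustrative):

----
$ sqoop import --connect jdbc:mysql://db.example.com/corp --table EMPLOYEES \
    --input-fields-terminated-by ',' --fields-terminated-by '\t'
----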

include::hive-args.txt[]

include::hive.txt[]

include::codegen-args.txt[]

As mentioned earlier, a byproduct of importing a table to HDFS is a
class which can manipulate the imported data. If the data is stored in
SequenceFiles, this class will be used for the data's serialization
container. Therefore, you should use this class in your subsequent
MapReduce processing of the data.

The class is typically named after the table; a table named +foo+ will
generate a class named +foo+. You may want to override this class
name. For example, if your table is named +EMPLOYEES+, you may want to
specify +\--class-name Employee+ instead. Similarly, you can specify
just the package name with +\--package-name+. The following import
generates a class named +com.foocorp.SomeTable+:

----
$ sqoop import --connect <connect-str> --table SomeTable --package-name com.foocorp
----

The +.java+ source file for your class will be written to the current
working directory when you run +sqoop+. You can control the output
directory with +\--outdir+. For example, +\--outdir src/generated/+.

The import process compiles the source into +.class+ and +.jar+ files;
these are ordinarily stored under +/tmp+. You can select an alternate
target directory with +\--bindir+. For example, +\--bindir /scratch+.
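
Putting these together, a hypothetical import that writes generated
sources and compiled artifacts to dedicated directories might look
like this sketch:

----
$ sqoop import --connect <connect-str> --table EMPLOYEES \
    --outdir src/generated/ --bindir /scratch
----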

If you already have a compiled class that can be used to perform the
import and want to suppress the code-generation aspect of the import
process, you can use an existing jar and class by
providing the +\--jar-file+ and +\--class-name+ options. For example:

----
$ sqoop import --table SomeTable --jar-file mydatatypes.jar \
    --class-name SomeTableType
----

This command will load the +SomeTableType+ class out of +mydatatypes.jar+.


Example Invocations
~~~~~~~~~~~~~~~~~~~

The following examples illustrate how to use the import tool in a variety
of situations.

A basic import of a table named +EMPLOYEES+ in the +corp+ database:

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES
----

A basic import requiring a login:

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    --username SomeUser -P
Enter password: (hidden)
----

Selecting specific columns from the +EMPLOYEES+ table:

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    --columns "employee_id,first_name,last_name,job_title"
----

Controlling the import parallelism (using 8 parallel tasks):

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    -m 8
----

Enabling the MySQL "direct mode" fast path:

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    --direct
----

Storing data in SequenceFiles, and setting the generated class name to
+com.foocorp.Employee+:

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    --class-name com.foocorp.Employee --as-sequencefile
----

Specifying the delimiters to use in a text-mode import:

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    --fields-terminated-by '\t' --lines-terminated-by '\n' \
    --optionally-enclosed-by '\"'
----

Importing the data to Hive:

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    --hive-import
----

Importing only new employees:

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    --where "start_date > '2010-01-01'"
----

Changing the splitting column from the default:

----
$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \
    --split-by dept_id
----

Verifying that an import was successful:

----
$ hadoop fs -ls EMPLOYEES
Found 5 items
drwxr-xr-x   - someuser somegrp       0 2010-04-27 16:40 /user/someuser/EMPLOYEES/_logs
-rw-r--r--   1 someuser somegrp 2913511 2010-04-27 16:40 /user/someuser/EMPLOYEES/part-m-00000
-rw-r--r--   1 someuser somegrp 1683938 2010-04-27 16:40 /user/someuser/EMPLOYEES/part-m-00001
-rw-r--r--   1 someuser somegrp 7245839 2010-04-27 16:40 /user/someuser/EMPLOYEES/part-m-00002
-rw-r--r--   1 someuser somegrp 7842523 2010-04-27 16:40 /user/someuser/EMPLOYEES/part-m-00003

$ hadoop fs -cat EMPLOYEES/part-m-00000 | head -n 10
0,joe,smith,engineering
1,jane,doe,marketing
...
----

34  src/docs/user/input-args.txt  Normal file
@ -0,0 +1,34 @@

////
Licensed to Cloudera, Inc. under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
////

.Input parsing arguments:
[grid="all"]
`----------------------------------------`----------------------------------
Argument                                 Description
----------------------------------------------------------------------------
+\--input-enclosed-by <char>+            Sets a required field encloser
+\--input-escaped-by <char>+             Sets the input escape \
                                         character
+\--input-fields-terminated-by <char>+   Sets the input field separator
+\--input-lines-terminated-by <char>+    Sets the input end-of-line \
                                         character
+\--input-optionally-enclosed-by <char>+ Sets a field enclosing \
                                         character
----------------------------------------------------------------------------

45  src/docs/user/intro.txt  Normal file
@ -0,0 +1,45 @@

////
Licensed to Cloudera, Inc. under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
////


Introduction
------------

Sqoop is a tool designed to transfer data between Hadoop and
relational databases. You can use Sqoop to import data from a
relational database management system (RDBMS) such as MySQL or Oracle
into the Hadoop Distributed File System (HDFS),
transform the data in Hadoop MapReduce, and then export the data back
into an RDBMS.

Sqoop automates most of this process, relying on the database to
describe the schema for the data to be imported. Sqoop uses MapReduce
to import and export the data, which provides parallel operation as
well as fault tolerance.

This document describes how to get started using Sqoop to move data
between databases and Hadoop and provides reference information for
the operation of the Sqoop command-line tool suite. This document is
intended for:

- System and application programmers
- System administrators
- Database administrators
- Data analysts
- Data engineers

@ -17,27 +17,39 @@
////


Generated Class Names
~~~~~~~~~~~~~~~~~~~~~
+sqoop-list-databases+
----------------------

By default, classes are named after the table they represent. e.g.,
+sqoop --table foo+ will generate a file named +foo.java+. You can
override the generated class name with the +--class-name+ argument.
Purpose
~~~~~~~

List database schemas present on a server.

Syntax
~~~~~~

----
$ sqoop --connect jdbc:mysql://database.example.com/employees \
    --table employee_names --class-name com.example.EmployeeNames
$ sqoop list-databases (generic-args) (list-databases-args)
$ sqoop-list-databases (generic-args) (list-databases-args)
----
_This generates a file named +com/example/EmployeeNames.java+_

If you want to specify a package name for generated classes, but
still want them to be named after the table they represent, you
can instead use the argument +--package-name+:
Although the Hadoop generic arguments must precede any list-databases
arguments, the list-databases arguments can be entered in any order
with respect to one another.

include::common-args.txt[]

Example Invocations
~~~~~~~~~~~~~~~~~~~

List database schemas available on a MySQL server:

----
$ sqoop --connect jdbc:mysql://database.example.com/employees \
    --table employee_names --package-name com.example
$ sqoop list-databases --connect jdbc:mysql://database.example.com/
information_schema
employees
----
_This generates a file named +com/example/employee_names.java+_

NOTE: This only works with HSQLDB and MySQL. A vendor-agnostic implementation
of this function has not yet been implemented.

54  src/docs/user/list-tables.txt  Normal file
@ -0,0 +1,54 @@

////
Licensed to Cloudera, Inc. under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
////


+sqoop-list-tables+
-------------------

Purpose
~~~~~~~

List tables present in a database.

Syntax
~~~~~~

----
$ sqoop list-tables (generic-args) (list-tables-args)
$ sqoop-list-tables (generic-args) (list-tables-args)
----

Although the Hadoop generic arguments must precede any list-tables
arguments, the list-tables arguments can be entered in any order
with respect to one another.

include::common-args.txt[]

Example Invocations
~~~~~~~~~~~~~~~~~~~

List tables available in the "corp" database:

----
$ sqoop list-tables --connect jdbc:mysql://database.example.com/corp
employees
payroll_checks
job_descriptions
office_supplies
----

35  src/docs/user/output-args.txt  Normal file
@ -0,0 +1,35 @@

////
Licensed to Cloudera, Inc. under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
////

.Output line formatting arguments:
[grid="all"]
`----------------------------------`----------------------------------
Argument                           Description
----------------------------------------------------------------------
+\--enclosed-by <char>+            Sets a required field enclosing \
                                   character
+\--escaped-by <char>+             Sets the escape character
+\--fields-terminated-by <char>+   Sets the field separator character
+\--lines-terminated-by <char>+    Sets the end-of-line character
+\--mysql-delimiters+              Uses MySQL's default delimiter set: \
                                   fields: +,+ lines: +\n+ \
                                   escaped-by: +\+ \
                                   optionally-enclosed-by: +'+
+\--optionally-enclosed-by <char>+ Sets a field enclosing character
----------------------------------------------------------------------

@ -19,26 +19,26 @@
The delimiters used to separate fields and records can be specified
on the command line, as can a quoting character and an escape character
(for quoting delimiters inside a value). Data imported with
+--as-textfile+ will be formatted according to these parameters. Classes
+\--as-textfile+ will be formatted according to these parameters. Classes
generated by Sqoop will encode this information, so using +toString()+
from a data record stored +--as-sequencefile+ will reproduce your
from a data record stored +\--as-sequencefile+ will reproduce your
specified formatting.

The +(char)+ argument for each argument in this section can be specified
either as a normal character (e.g., +--fields-terminated-by ,+) or via
either as a normal character (e.g., +\--fields-terminated-by ,+) or via
an escape sequence. Arguments of the form +\0xhhh+ will be interpreted
as a hexadecimal representation of a character with hex number _hhh_.
Arguments of the form +\0ooo+ will be treated as an octal representation
of a character represented by octal number _ooo_. The special escapes
+\n+, +\r+, +\"+, +\b+, +\t+, and +\\+ act as they do inside Java strings. +\0+ will be
treated as NUL. This will insert NUL characters between fields or lines
(if used for +--fields-terminated-by+ or +--lines-terminated-by+), or will
disable enclosing/escaping if used for one of the +--enclosed-by+,
+--optionally-enclosed-by+, or +--escaped-by+ arguments.
(if used for +\--fields-terminated-by+ or +\--lines-terminated-by+), or will
disable enclosing/escaping if used for one of the +\--enclosed-by+,
+\--optionally-enclosed-by+, or +\--escaped-by+ arguments.

The default delimiters are +,+ for fields, +\n+ for records, no quote
character, and no escape character. Note that this can lead to
ambiguous/unparsable records if you import database records containing
commas or newlines in the field data. For unambiguous parsing, both must
be enabled, e.g., via +--mysql-delimiters+.
be enabled, e.g., via +\--mysql-delimiters+.

61  src/docs/user/preface.txt  Normal file
@ -0,0 +1,61 @@

////
Licensed to Cloudera, Inc. under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
////


Supported Releases
------------------

This documentation applies to Sqoop v1.0.0 (June 2010).

Sqoop Releases
--------------

Sqoop is an open source software product of Cloudera, Inc.

Software development for Sqoop occurs at http://github.com/cloudera/sqoop.
At that site you can obtain:
- New releases of Sqoop as well as its most recent source code
- An issue tracker
- A wiki that contains Sqoop documentation

Sqoop is compatible with Apache Hadoop 0.21 and Cloudera's
Distribution of Hadoop version 3.

Prerequisites
-------------

The following prerequisite knowledge is required for this product:

- Basic computer technology and terminology
- Familiarity with command-line interfaces such as +bash+
- Relational database management systems
- Basic familiarity with the purpose and operation of Hadoop

Before you can use Sqoop, a release of Hadoop must be installed and
configured. We recommend that you download Cloudera's Distribution
for Hadoop (CDH3) from the Cloudera Software Archive at
http://archive.cloudera.com for straightforward installation of Hadoop
on Linux systems.

This document assumes you are using a Linux or Linux-like environment.
If you are using Windows, you may be able to use cygwin to accomplish
most of the following tasks. If you are using Mac OS X, you should see
few (if any) compatibility errors. Sqoop is predominantly operated and
tested on Linux.

33  src/docs/user/support.txt  Normal file
@ -0,0 +1,33 @@

////
Licensed to Cloudera, Inc. under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
////

Getting Support
---------------

Report bugs in Sqoop to the issue tracker at
http://github.com/cloudera/sqoop/issues[].

For general questions and answers, a support forum is available at
http://getsatisfaction.com/cloudera/products/cloudera_sqoop[].

Before contacting either forum, run your Sqoop job with the
+\--verbose+ flag to acquire as much debugging information as
possible. Also report the string returned by +sqoop version+ as
well as the version of Hadoop you are running (+hadoop version+).

168  src/docs/user/tools.txt  Normal file
@ -0,0 +1,168 @@

////
Licensed to Cloudera, Inc. under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
////

Sqoop Tools
-----------

Sqoop is a collection of related tools. To use Sqoop, you specify the
tool you want to use and the arguments that control the tool.

If Sqoop is compiled from its own source, you can run Sqoop without a formal
installation process by running the +bin/sqoop+ program. Users
of a packaged deployment of Sqoop (such as an RPM shipped with Cloudera's
Distribution for Hadoop) will see this program installed as +/usr/bin/sqoop+.
The remainder of this documentation will refer to this program as
+sqoop+. For example:

----
$ sqoop tool-name [tool-arguments]
----

NOTE: The following examples that begin with a +$+ character indicate
that the commands must be entered at a terminal prompt (such as
+bash+). The +$+ character represents the prompt itself; you should
not start these commands by typing a +$+. You can also enter commands
inline in the text of a paragraph; for example, +sqoop help+. These
examples do not show a +$+ prefix, but you should enter them the same
way. Don't confuse the +$+ shell prompt in the examples with the +$+
that precedes an environment variable name. For example, the string
literal +$HADOOP_HOME+ includes a "+$+".

Sqoop ships with a help tool. To display a list of all available
tools, type the following command:

----
$ sqoop help
usage: sqoop COMMAND [ARGS]

Available commands:
  codegen            Generate code to interact with database records
  create-hive-table  Import a table definition into Hive
  eval               Evaluate a SQL statement and display the results
  export             Export an HDFS directory to a database table
  help               List available commands
  import             Import a table from a database to HDFS
  import-all-tables  Import tables from a database to HDFS
  list-databases     List available databases on a server
  list-tables        List available tables in a database
  version            Display version information

See 'sqoop help COMMAND' for information on a specific command.
----

You can display help for a specific tool by entering: +sqoop help
(tool-name)+; for example, +sqoop help import+.

You can also add the +\--help+ argument to any command: +sqoop import
\--help+.

Using Command Aliases
~~~~~~~~~~~~~~~~~~~~~

In addition to typing the +sqoop (toolname)+ syntax, you can use alias
scripts that specify the +sqoop-(toolname)+ syntax. For example, the
scripts +sqoop-import+, +sqoop-export+, etc. each select a specific
tool.

Controlling the Hadoop Installation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

You invoke Sqoop through the program launch capability provided by
Hadoop. The +sqoop+ command-line program is a wrapper which runs the
+bin/hadoop+ script shipped with Hadoop. If you have multiple
installations of Hadoop present on your machine, you can select the
Hadoop installation by setting the +$HADOOP_HOME+ environment
variable.

For example:

----
$ HADOOP_HOME=/path/to/some/hadoop sqoop import --arguments...
----

or:

----
$ export HADOOP_HOME=/some/path/to/hadoop
$ sqoop import --arguments...
----

If +$HADOOP_HOME+ is not set, Sqoop will use the default installation
location for Cloudera's Distribution for Hadoop, +/usr/lib/hadoop+.

The active Hadoop configuration is loaded from +$HADOOP_HOME/conf/+,
unless the +$HADOOP_CONF_DIR+ environment variable is set.


Using Generic and Specific Arguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To control the operation of each Sqoop tool, you use generic and
specific arguments.

For example:

----
$ sqoop help import
usage: sqoop import [GENERIC-ARGS] [TOOL-ARGS]

Common arguments:
   --connect <jdbc-uri>     Specify JDBC connect string
   --driver <class-name>    Manually specify JDBC driver class to use
   --hadoop-home <dir>      Override $HADOOP_HOME
   --help                   Print usage instructions
   -P                       Read password from console
   --password <password>    Set authentication password
   --username <username>    Set authentication username
   --verbose                Print more information while working

[...]

Generic Hadoop command-line arguments:
(must preceed any tool-specific arguments)
Generic options supported are
-conf <configuration file>     specify an application configuration file
-D <property=value>            use value for given property
-fs <local|namenode:port>      specify a namenode
-jt <local|jobtracker:port>    specify a job tracker
-files <comma separated list of files>    specify comma separated files to be copied to the map reduce cluster
-libjars <comma separated list of jars>    specify comma separated jar files to include in the classpath.
-archives <comma separated list of archives>    specify comma separated archives to be unarchived on the compute machines.

The general command line syntax is
bin/hadoop command [genericOptions] [commandOptions]
----

You must supply the generic arguments +-conf+, +-D+, and so on after the
tool name but *before* any tool-specific arguments (such as
+\--connect+). Note that generic Hadoop arguments are preceded by a
single dash character (+-+), whereas tool-specific arguments start
with two dashes (+\--+), unless they are single character arguments such as +-P+.
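
For example, a sketch of placing a generic +-D+ property before the
tool-specific arguments (the property name and value are
illustrative):

----
$ sqoop import -D mapred.job.name=employee-import \
    --connect jdbc:mysql://db.example.com/corp --table EMPLOYEES
----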

The +-conf+, +-D+, +-fs+ and +-jt+ arguments control the configuration
and Hadoop server settings. The +-files+, +-libjars+, and +-archives+
arguments are not typically used with Sqoop, but they are included as
part of Hadoop's internal argument-parsing system.


Using Tools
~~~~~~~~~~~

The following sections will describe each tool's operation. The
tools are listed in the most likely order you will find them useful.

@ -17,18 +17,32 @@
////


Listing Available Tables
~~~~~~~~~~~~~~~~~~~~~~~~
+sqoop-version+
---------------

Within a database, you can list the tables available for import with
the +--list-tables+ command. The following example shows four tables available
within the "employees" example database:
Purpose
~~~~~~~

Display version information for Sqoop.

Syntax
~~~~~~

----
$ sqoop --connect jdbc:mysql://database.example.com/employees --list-tables
employee_names
payroll_checks
job_descriptions
office_supplies
$ sqoop version
$ sqoop-version
----


Example Invocations
~~~~~~~~~~~~~~~~~~~

Display the version:

----
$ sqoop version
Sqoop 1.0.0
git commit id 46b3e06b79a8411320d77c984c3030db47dd1c22
Compiled by aaron@jargon on Mon May 17 13:43:22 PDT 2010
----

@ -1092,7 +1092,11 @@ public abstract static class Reader implements Closeable {
  public abstract Path getPath();

  /**
   * Report the current position in the file.
   * Report the current position in the file. Note that the internal
   * cursor may move in an unpredictable fashion; e.g., to fetch
   * additional data from the index stored at the end of the file.
   * Clients may be more interested in the getRecordOffset() method
   * which returns the starting offset of the current record.
   * @return the current offset from the start of the file in bytes.
   */
  public abstract long tell() throws IOException;

@ -102,7 +102,7 @@ public abstract class BaseSqoopTool extends SqoopTool {
  public static final String PACKAGE_NAME_ARG = "package-name";
  public static final String CLASS_NAME_ARG = "class-name";
  public static final String JAR_FILE_NAME_ARG = "jar-file";
  public static final String DEBUG_SQL_ARG = "expr";
  public static final String DEBUG_SQL_ARG = "query";
  public static final String DEBUG_SQL_SHORT_ARG = "e";
  public static final String VERBOSE_ARG = "verbose";
  public static final String HELP_ARG = "help";

@ -399,7 +399,7 @@ protected RelatedOptions getCodeGenOpts(boolean multiTable) {
    if (!multiTable) {
      codeGenOpts.addOption(OptionBuilder.withArgName("name")
          .hasArg()
          .withDescription("Sets the generated class name."
          .withDescription("Sets the generated class name. "
          + "This overrides --" + PACKAGE_NAME_ARG + ". When combined "
          + "with --" + JAR_FILE_NAME_ARG + ", sets the input class.")
          .withLongOpt(CLASS_NAME_ARG)