mirror of
https://github.com/apache/sqoop.git
synced 2025-05-20 02:40:52 +08:00
MAPREDUCE-906. Update Sqoop documentation. Contributed by Aaron Kimball
From: Christopher Douglas <cdouglas@apache.org> git-svn-id: https://svn.apache.org/repos/asf/incubator/sqoop/trunk@1149834 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4a689cf491
commit
4b193ec488
@ -152,4 +152,12 @@ to call at top-level: ant deploy-contrib compile-core-test
|
||||
<fail if="tests.failed">Tests failed!</fail>
|
||||
</target>
|
||||
|
||||
<target name="doc">
|
||||
<exec executable="make" failonerror="true">
|
||||
<arg value="-C" />
|
||||
<arg value="${basedir}/doc" />
|
||||
<arg value="BUILDROOT=${build.dir}" />
|
||||
</exec>
|
||||
</target>
|
||||
|
||||
</project>
|
||||
|
17
doc/.gitignore
vendored
Normal file
17
doc/.gitignore
vendored
Normal file
@ -0,0 +1,17 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
/Sqoop-manpage.xml
|
||||
/sqoop.1
|
||||
/Sqoop-web.html
|
43
doc/Makefile
Normal file
43
doc/Makefile
Normal file
@ -0,0 +1,43 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
BUILDROOT=../../../../build/contrib/sqoop
|
||||
BUILD_DIR=$(BUILDROOT)/doc
|
||||
|
||||
all: man userguide
|
||||
|
||||
man: $(BUILD_DIR)/sqoop.1.gz
|
||||
|
||||
userguide: $(BUILD_DIR)/SqoopUserGuide.html
|
||||
|
||||
$(BUILD_DIR)/sqoop.1.gz: Sqoop-manpage.txt *formatting*.txt
|
||||
asciidoc -b docbook -d manpage Sqoop-manpage.txt
|
||||
xmlto man Sqoop-manpage.xml
|
||||
gzip sqoop.1
|
||||
rm Sqoop-manpage.xml
|
||||
mkdir -p $(BUILD_DIR)
|
||||
mv sqoop.1.gz $(BUILD_DIR)
|
||||
|
||||
$(BUILD_DIR)/SqoopUserGuide.html: SqoopUserGuide.txt *.txt
|
||||
asciidoc SqoopUserGuide.txt
|
||||
mkdir -p $(BUILD_DIR)
|
||||
mv SqoopUserGuide.html $(BUILD_DIR)
|
||||
|
||||
clean:
|
||||
-rm $(BUILD_DIR)/sqoop.1.gz
|
||||
-rm $(BUILD_DIR)/SqoopUserGuide.html
|
||||
|
||||
.PHONY: all man userguide clean
|
||||
|
177
doc/Sqoop-manpage.txt
Normal file
177
doc/Sqoop-manpage.txt
Normal file
@ -0,0 +1,177 @@
|
||||
sqoop(1)
|
||||
========
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
NAME
|
||||
----
|
||||
sqoop - SQL-to-Hadoop import tool
|
||||
|
||||
SYNOPSIS
|
||||
--------
|
||||
'sqoop' <options>
|
||||
|
||||
DESCRIPTION
|
||||
-----------
|
||||
Sqoop is a tool designed to help users of large data systems import existing
|
||||
relational databases into their Hadoop clusters. Sqoop uses JDBC to
|
||||
connect to a database, examine each table's schema, and auto-generate
|
||||
the necessary classes to import data into HDFS. It then instantiates
|
||||
a MapReduce job to read tables from the database via the DBInputFormat
|
||||
(JDBC-based InputFormat). Tables are read into a set of files loaded
|
||||
into HDFS. Both SequenceFile and text-based targets are supported. Sqoop
|
||||
also supports high-performance imports from select databases including MySQL.
|
||||
|
||||
OPTIONS
|
||||
-------
|
||||
|
||||
The +--connect+ option is always required. To perform an import, one of
|
||||
+--table+ or +--all-tables+ is required as well. Alternatively, you can
|
||||
specify +--generate-only+ or one of the arguments in "Additional commands."
|
||||
|
||||
|
||||
Database connection options
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
--connect (jdbc-uri)::
|
||||
Specify JDBC connect string (required)
|
||||
|
||||
--driver (class-name)::
|
||||
Manually specify JDBC driver class to use
|
||||
|
||||
--username (username)::
|
||||
Set authentication username
|
||||
|
||||
--password (password)::
|
||||
Set authentication password
|
||||
(Note: This is very insecure. You should use -P instead.)
|
||||
|
||||
-P::
|
||||
Prompt for user password
|
||||
|
||||
--direct::
|
||||
Use direct import fast path (mysql only)
|
||||
|
||||
Import control options
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
--all-tables::
|
||||
Import all tables in database
|
||||
(Ignores +--table+, +--columns+, +--order-by+, and +--where+)
|
||||
|
||||
--columns (col,col,col...)::
|
||||
Columns to export from table
|
||||
|
||||
--split-by (column-name)::
|
||||
Column of the table used to split the table for parallel import
|
||||
|
||||
--hadoop-home (dir)::
|
||||
Override $HADOOP_HOME
|
||||
|
||||
--hive-home (dir)::
|
||||
Override $HIVE_HOME
|
||||
|
||||
--warehouse-dir (dir)::
|
||||
Tables are uploaded to the HDFS path +/warehouse/dir/(tablename)/+
|
||||
|
||||
--as-sequencefile::
|
||||
Imports data to SequenceFiles
|
||||
|
||||
--as-textfile::
|
||||
Imports data as plain text (default)
|
||||
|
||||
--hive-import::
|
||||
If set, then import the table into Hive
|
||||
|
||||
--table (table-name)::
|
||||
The table to import
|
||||
|
||||
--where (clause)::
|
||||
Import only the rows for which _clause_ is true.
|
||||
e.g.: `--where "user_id > 400 AND hidden == 0"`
|
||||
|
||||
|
||||
Output line formatting options
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
include::output-formatting.txt[]
|
||||
include::output-formatting-args.txt[]
|
||||
|
||||
Input line parsing options
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
include::input-formatting.txt[]
|
||||
include::input-formatting-args.txt[]
|
||||
|
||||
Code generation options
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
--bindir (dir)::
|
||||
Output directory for compiled objects
|
||||
|
||||
--class-name (name)::
|
||||
Sets the name of the class to generate. By default, classes are
|
||||
named after the table they represent. Using this parameter
|
||||
ignores +--package-name+.
|
||||
|
||||
--generate-only::
|
||||
Stop after code generation; do not import
|
||||
|
||||
--outdir (dir)::
|
||||
Output directory for generated code
|
||||
|
||||
--package-name (package)::
|
||||
Puts auto-generated classes in the named Java package
|
||||
|
||||
Additional commands
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
These commands cause Sqoop to report information and exit;
|
||||
no import or code generation is performed.
|
||||
|
||||
--debug-sql (statement)::
|
||||
Execute 'statement' in SQL and display the results
|
||||
|
||||
--help::
|
||||
Display usage information and exit
|
||||
|
||||
--list-databases::
|
||||
List all databases available and exit
|
||||
|
||||
--list-tables::
|
||||
List tables in database and exit
|
||||
|
||||
|
||||
ENVIRONMENT
|
||||
-----------
|
||||
|
||||
JAVA_HOME::
|
||||
As part of its import process, Sqoop generates and compiles Java code
|
||||
by invoking the Java compiler *javac*(1). As a result, JAVA_HOME must
|
||||
be set to the location of your JDK (note: This cannot just be a JRE).
|
||||
e.g., +/usr/java/default+. Hadoop (and Sqoop) requires Sun Java 1.6 which
|
||||
can be downloaded from http://java.sun.com.
|
||||
|
||||
HADOOP_HOME::
|
||||
The location of the Hadoop jar files. If you installed Hadoop via RPM
|
||||
or DEB, these are in +/usr/lib/hadoop-20+.
|
||||
|
||||
HIVE_HOME::
|
||||
If you are performing a Hive import, you must identify the location of
|
||||
Hive's jars and configuration. If you installed Hive via RPM or DEB,
|
||||
these are in +/usr/lib/hive+.
|
||||
|
63
doc/SqoopUserGuide.txt
Normal file
63
doc/SqoopUserGuide.txt
Normal file
@ -0,0 +1,63 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
include::intro.txt[]
|
||||
|
||||
|
||||
The Sqoop Command Line
|
||||
----------------------
|
||||
|
||||
To execute Sqoop, run with Hadoop:
|
||||
----
|
||||
$ bin/hadoop jar contrib/sqoop/hadoop-$(version)-sqoop.jar (arguments)
|
||||
----
|
||||
|
||||
NOTE: Throughout this document, we will use `sqoop` as shorthand for the
|
||||
above. i.e., `$ sqoop (arguments)`
|
||||
|
||||
You pass this program options describing the
|
||||
import job you want to perform. If you need a hint, running Sqoop with
|
||||
`--help` will print out a list of all the command line
|
||||
options available. The +sqoop(1)+ manual page will also describe
|
||||
Sqoop's available arguments in greater detail. The manual page is built
|
||||
in `$HADOOP_HOME/build/contrib/sqoop/doc/sqoop.1.gz`.
|
||||
The following subsections will describe the most common modes of operation.
|
||||
|
||||
include::connecting.txt[]
|
||||
|
||||
include::listing-dbs.txt[]
|
||||
|
||||
include::listing-tables.txt[]
|
||||
|
||||
include::full-db-import.txt[]
|
||||
|
||||
include::table-import.txt[]
|
||||
|
||||
include::controlling-output-format.txt[]
|
||||
|
||||
include::classnames.txt[]
|
||||
|
||||
include::misc-args.txt[]
|
||||
|
||||
include::direct.txt[]
|
||||
|
||||
include::hive.txt[]
|
||||
|
||||
include::supported-dbs.txt[]
|
||||
|
43
doc/classnames.txt
Normal file
43
doc/classnames.txt
Normal file
@ -0,0 +1,43 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
Generated Class Names
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
By default, classes are named after the table they represent. e.g.,
|
||||
+sqoop --table foo+ will generate a file named +foo.java+. You can
|
||||
override the generated class name with the +--class-name+ argument.
|
||||
|
||||
----
|
||||
$ sqoop --connect jdbc:mysql://database.example.com/employees \
|
||||
--table employee_names --class-name com.example.EmployeeNames
|
||||
----
|
||||
_This generates a file named +com/example/EmployeeNames.java+_
|
||||
|
||||
If you want to specify a package name for generated classes, but
|
||||
still want them to be named after the table they represent, you
|
||||
can instead use the argument +--package-name+:
|
||||
|
||||
----
|
||||
$ sqoop --connect jdbc:mysql://database.example.com/employees \
|
||||
--table employee_names --package-name com.example
|
||||
----
|
||||
_This generates a file named +com/example/employee_names.java+_
|
||||
|
||||
|
85
doc/connecting.txt
Normal file
85
doc/connecting.txt
Normal file
@ -0,0 +1,85 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
Connecting to a Database Server
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Sqoop is designed to import tables from a database into HDFS. As such,
|
||||
it requires a _connect string_ that describes how to connect to the
|
||||
database. The _connect string_ looks like a URL, and is communicated to
|
||||
Sqoop with the +--connect+ argument. This describes the server and
|
||||
database to connect to; it may also specify the port. e.g.:
|
||||
|
||||
----
|
||||
$ sqoop --connect jdbc:mysql://database.example.com/employees
|
||||
----
|
||||
|
||||
This string will connect to a MySQL database named +employees+ on the
|
||||
host +database.example.com+. It's important that you *do not* use the URL
|
||||
+localhost+ if you intend to use Sqoop with a distributed Hadoop
|
||||
cluster. The connect string you supply will be used on TaskTracker nodes
|
||||
throughout your MapReduce cluster; if they're told to connect to the
|
||||
literal name +localhost+, they'll each reach a different
|
||||
database (or more likely, no database at all)! Instead, you should use
|
||||
the full hostname or IP address of the database host that can be seen
|
||||
by all your remote nodes.
|
||||
|
||||
You may need to authenticate against the database before you can
|
||||
access it. The +--username+ and +--password+ or +-P+ parameters can
|
||||
be used to supply a username and a password to the database. e.g.:
|
||||
|
||||
----
|
||||
$ sqoop --connect jdbc:mysql://database.example.com/employees \
|
||||
--username aaron --password 12345
|
||||
----
|
||||
|
||||
.Password security
|
||||
WARNING: The +--password+ parameter is insecure, as other users may
|
||||
be able to read your password from the command-line arguments via
|
||||
the output of programs such as `ps`. The *+-P+* argument will read
|
||||
a password from a console prompt, and is the preferred method of
|
||||
entering credentials. Credentials may still be transferred between
|
||||
nodes of the MapReduce cluster using insecure means.
|
||||
|
||||
Sqoop automatically supports several databases, including MySQL. Connect strings beginning
|
||||
with +jdbc:mysql://+ are handled automatically by Sqoop, though you may need
|
||||
to install the driver yourself. (A full list of databases with
|
||||
built-in support is provided in the "Supported Databases" section, below.)
|
||||
|
||||
You can use Sqoop with any other
|
||||
JDBC-compliant database as well. First, download the appropriate JDBC
|
||||
driver for the database you want to import from, and install the .jar
|
||||
file in the +/usr/hadoop/lib+ directory on all machines in your Hadoop
|
||||
cluster, or some other directory which is in the classpath
|
||||
on all nodes. Each driver jar also has a specific driver class which defines
|
||||
the entry-point to the driver. For example, MySQL's Connector/J library has
|
||||
a driver class of +com.mysql.jdbc.Driver+. Refer to your database
|
||||
vendor-specific documentation to determine the main driver class.
|
||||
This class must be provided as an argument to Sqoop with +--driver+.
|
||||
|
||||
For example, to connect to a postgres database, first download the driver from
|
||||
link:http://jdbc.postgresql.org[http://jdbc.postgresql.org] and
|
||||
install it in your Hadoop lib path.
|
||||
Then run Sqoop with something like:
|
||||
|
||||
----
|
||||
$ sqoop --connect jdbc:postgresql://postgres-server.example.com/employees \
|
||||
--driver org.postgresql.Driver
|
||||
----
|
||||
|
42
doc/controlling-input-format.txt
Normal file
42
doc/controlling-input-format.txt
Normal file
@ -0,0 +1,42 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
Controlling the Input Format
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
include::input-formatting.txt[]
|
||||
|
||||
The following arguments allow you to control the input format of
|
||||
records:
|
||||
|
||||
include::input-formatting-args.txt[]
|
||||
|
||||
If you have already imported data into HDFS in a text-based
|
||||
representation and want to change the delimiters being used, you
|
||||
should regenerate the class via `sqoop --generate-only`, specifying
|
||||
the new delimiters with +--fields-terminated-by+, etc., and the old
|
||||
delimiters with +--input-fields-terminated-by+, etc. Then run a
|
||||
MapReduce job where your mapper creates an instance of your record
|
||||
class, uses its +parse()+ method to read the fields using the old
|
||||
delimiters, and emits a new +Text+ output value via the record's
|
||||
+toString()+ method, which will use the new delimiters. You'll then
|
||||
want to regenerate the class another time without the
|
||||
+--input-fields-terminated-by+ specified so that the new delimiters
|
||||
are used for both input and output.
|
||||
|
38
doc/controlling-output-format.txt
Normal file
38
doc/controlling-output-format.txt
Normal file
@ -0,0 +1,38 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
Controlling the Output Format
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
include::output-formatting.txt[]
|
||||
|
||||
The following arguments allow you to control the output format of
|
||||
records:
|
||||
|
||||
include::output-formatting-args.txt[]
|
||||
|
||||
For example, we may want to separate records by tab characters, with
|
||||
every record surrounded by "double quotes", and internal quote marks
|
||||
escaped by a backslash (+\+) character:
|
||||
|
||||
----
|
||||
$ sqoop --connect jdbc:mysql://database.example.com/employees \
|
||||
--table employee_names --fields-terminated-by \t \
|
||||
--lines-terminated-by \n --enclosed-by '\"' --escaped-by '\\'
|
||||
----
|
||||
|
51
doc/direct.txt
Normal file
51
doc/direct.txt
Normal file
@ -0,0 +1,51 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
Direct-mode Imports
|
||||
------------------
|
||||
|
||||
While the JDBC-based import method used by Sqoop provides it with the
|
||||
ability to read from a variety of databases using a generic driver, it
|
||||
is not the most high-performance method available. Sqoop can read from
|
||||
certain database systems faster by using their built-in export tools.
|
||||
|
||||
For example, Sqoop can read from a local MySQL database by using the +mysqldump+
|
||||
tool distributed with MySQL. If you run Sqoop on the same machine where a
|
||||
MySQL database is present, you can take advantage of this faster
|
||||
import method by running Sqoop with the +--direct+ argument. This
|
||||
combined with a connect string that begins with +jdbc:mysql://+ will
|
||||
inform Sqoop that it should select the faster access method.
|
||||
|
||||
If your delimiters exactly match the delimiters used by +mysqldump+,
|
||||
then Sqoop will use a fast-path that copies the data directly from
|
||||
+mysqldump+'s output into HDFS. Otherwise, Sqoop will parse +mysqldump+'s
|
||||
output into fields and transcode them into the user-specified delimiter set.
|
||||
This incurs additional processing, so performance may suffer.
|
||||
For convenience, the +--mysql-delimiters+
|
||||
argument will set all the output delimiters to be consistent with
|
||||
+mysqldump+'s format.
|
||||
|
||||
Sqoop also provides a direct-mode backend for PostgreSQL that uses the
|
||||
+COPY TO STDOUT+ protocol from +psql+. No specific delimiter set provides
|
||||
better performance; Sqoop will forward delimiter control arguments to
|
||||
+psql+.
|
||||
|
||||
The "Supported Databases" section provides a full list of database vendors
|
||||
which have direct-mode support from Sqoop.
|
||||
|
92
doc/full-db-import.txt
Normal file
92
doc/full-db-import.txt
Normal file
@ -0,0 +1,92 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
Automatic Full-database Import
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
If you want to import all the tables in a database, you can use the
|
||||
+--all-tables+ command to do so:
|
||||
|
||||
----
|
||||
$ sqoop --connect jdbc:mysql://database.example.com/employees --all-tables
|
||||
----
|
||||
|
||||
This will query the database for the available tables, generate an ORM
|
||||
class for each table, and run a MapReduce job to import each one.
|
||||
Hadoop uses the DBInputFormat to read from a database into a Mapper
|
||||
instance. To read a table into a MapReduce program requires creating a
|
||||
class to hold the fields of one row of the table. One of the benefits
|
||||
of Sqoop is that it generates this class definition for you, based on
|
||||
the table definition in the database.
|
||||
|
||||
The generated +.java+ files are, by default, placed in the current
|
||||
directory. You can supply a different directory with the +--outdir+
|
||||
parameter. These are then compiled into +.class+ and +.jar+ files for use
|
||||
by the MapReduce job that it launches. These files are created in a
|
||||
temporary directory. You can redirect this target with +--bindir+.
|
||||
|
||||
Each table will be imported into a separate directory in HDFS, with
|
||||
the same name as the table. For instance, if my Hadoop username is
|
||||
aaron, the above command would have generated the following
|
||||
directories in HDFS:
|
||||
|
||||
----
|
||||
/user/aaron/employee_names
|
||||
/user/aaron/payroll_checks
|
||||
/user/aaron/job_descriptions
|
||||
/user/aaron/office_supplies
|
||||
----
|
||||
|
||||
You can change the base directory under which the tables are loaded
|
||||
with the +--warehouse-dir+ parameter. For example:
|
||||
|
||||
----
|
||||
$ sqoop --connect jdbc:mysql://database.example.com/employees --all-tables \
|
||||
--warehouse-dir /common/warehouse
|
||||
----
|
||||
|
||||
This would create the following directories instead:
|
||||
|
||||
----
|
||||
/common/warehouse/employee_names
|
||||
/common/warehouse/payroll_checks
|
||||
/common/warehouse/job_descriptions
|
||||
/common/warehouse/office_supplies
|
||||
----
|
||||
|
||||
By default the data will be read into text files in HDFS. Each of the
|
||||
columns will be represented as comma-delimited text. Each row is
|
||||
terminated by a newline. See the section on "Controlling the Output
|
||||
Format" below for information on how to change these delimiters.
|
||||
|
||||
If you want to leverage compression and binary file formats, the
|
||||
+--as-sequencefile+ argument to Sqoop will import the table
|
||||
to a set of SequenceFiles instead. This stores each field of each
|
||||
database record in a separate object in a SequenceFile.
|
||||
This representation is also likely to be higher performance when used
|
||||
as an input to subsequent MapReduce programs as it does not require
|
||||
parsing. For completeness, Sqoop provides an +--as-textfile+ option, which is
|
||||
implied by default. An +--as-textfile+ on the command-line will override
|
||||
a previous +--as-sequencefile+ argument.
|
||||
|
||||
The SequenceFile format will embed the records from the database as
|
||||
objects using the code generated by Sqoop. It is important that you
|
||||
retain the +.java+ file for this class, as you will need to be able to
|
||||
instantiate the same type to read the objects back later, in other
|
||||
user-defined applications.
|
||||
|
58
doc/hive.txt
Normal file
58
doc/hive.txt
Normal file
@ -0,0 +1,58 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
Importing Data Into Hive
|
||||
------------------------
|
||||
|
||||
Sqoop's primary function is to upload your data into files in HDFS. If
|
||||
you have a Hive metastore associated with your HDFS cluster, Sqoop can
|
||||
also import the data into Hive by generating and executing a +CREATE
|
||||
TABLE+ statement to define the data's layout in Hive. Importing data
|
||||
into Hive is as simple as adding the *+--hive-import+* option to your
|
||||
Sqoop command line.
|
||||
|
||||
After your data is imported into HDFS, Sqoop will generate a Hive
|
||||
script containing a +CREATE TABLE+ operation defining your columns using
|
||||
Hive's types, and a +LOAD DATA INPATH+ statement to move the data files
|
||||
into Hive's warehouse directory. The script will be executed by
|
||||
calling the installed copy of hive on the machine where Sqoop is run.
|
||||
If you have multiple Hive installations, or +hive+ is not in your
|
||||
+$PATH+, use the *+--hive-home+* option to identify the Hive installation
|
||||
directory. Sqoop will use +$HIVE_HOME/bin/hive+ from here.
|
||||
|
||||
NOTE: This function is incompatible with +--as-sequencefile+.
|
||||
|
||||
Hive's text parser does not know how to support escaping or enclosing
|
||||
characters. Sqoop will print a warning if you use +--escaped-by+,
|
||||
+--enclosed-by+, or +--optionally-enclosed-by+ since Hive does not know
|
||||
how to parse these. It will pass the field and record terminators through
|
||||
to Hive. If you do not set any delimiters and do use +--hive-import+,
|
||||
the field delimiter will be set to +^A+ and the record delimiter will
|
||||
be set to +\n+ to be consistent with Hive's defaults.
|
||||
|
||||
Hive's Type System
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Hive users will note that there is not a one-to-one mapping between
|
||||
SQL types and Hive types. In general, SQL types that do not have a
|
||||
direct mapping (e.g., +DATE+, +TIME+, and +TIMESTAMP+) will be coerced to
|
||||
+STRING+ in Hive. The +NUMERIC+ and +DECIMAL+ SQL types will be coerced to
|
||||
+DOUBLE+. In these cases, Sqoop will emit a warning in its log messages
|
||||
informing you of the loss of precision.
|
||||
|
34
doc/input-formatting-args.txt
Normal file
34
doc/input-formatting-args.txt
Normal file
@ -0,0 +1,34 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
--input-fields-terminated-by (char)::
|
||||
Sets the input field separator
|
||||
|
||||
--input-lines-terminated-by (char)::
|
||||
Sets the input end-of-line char
|
||||
|
||||
--input-optionally-enclosed-by (char)::
|
||||
Sets an input field-enclosing character
|
||||
|
||||
--input-enclosed-by (char)::
|
||||
Sets a required input field encloser
|
||||
|
||||
--input-escaped-by (char)::
|
||||
Sets the input escape character
|
||||
|
24
doc/input-formatting.txt
Normal file
24
doc/input-formatting.txt
Normal file
@ -0,0 +1,24 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
Record classes generated by Sqoop include both a +toString()+ method
|
||||
that formats output records, and a +parse()+ method that interprets
|
||||
text based on an input delimiter set. The input delimiters default to
|
||||
the same ones chosen for output delimiters, but you can override these
|
||||
settings to support converting from one set of delimiters to another.
|
||||
|
34
doc/intro.txt
Normal file
34
doc/intro.txt
Normal file
@ -0,0 +1,34 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
Introduction
|
||||
------------
|
||||
|
||||
Sqoop is a tool designed to help users of large datasets import
|
||||
existing relational databases into their Hadoop clusters. Sqoop uses
|
||||
JDBC to connect to a database, examine each table's schema, and
|
||||
auto-generate the necessary classes to import data into HDFS. It
|
||||
then instantiates a MapReduce job to read tables from the database
|
||||
via the DBInputFormat (JDBC-based InputFormat). Tables are read
|
||||
into a set of files loaded into HDFS. Both SequenceFile and
|
||||
text-based targets are supported. Sqoop also supports high-performance
|
||||
imports from select databases including MySQL.
|
||||
|
||||
This document describes how to get started using Sqoop to import
|
||||
your data into Hadoop.
|
35
doc/listing-dbs.txt
Normal file
35
doc/listing-dbs.txt
Normal file
@ -0,0 +1,35 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
Listing Available Databases
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Once connected to a database server, you can list the available
|
||||
databases with the +--list-databases+ parameter. This currently is supported
|
||||
only by HSQLDB and MySQL. Note that in this case, the connect string does
|
||||
not include a database name, just a server address.
|
||||
|
||||
----
|
||||
$ sqoop --connect jdbc:mysql://database.example.com/ --list-databases
|
||||
information_schema
|
||||
employees
|
||||
----
|
||||
_This only works with HSQLDB and MySQL. A vendor-agnostic implementation of
|
||||
this function has not yet been implemented._
|
||||
|
34
doc/listing-tables.txt
Normal file
34
doc/listing-tables.txt
Normal file
@ -0,0 +1,34 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
Listing Available Tables
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Within a database, you can list the tables available for import with
|
||||
the +--list-tables+ command. The following example shows four tables available
|
||||
within the "employees" example database:
|
||||
|
||||
----
|
||||
$ sqoop --connect jdbc:mysql://database.example.com/employees --list-tables
|
||||
employee_names
|
||||
payroll_checks
|
||||
job_descriptions
|
||||
office_supplies
|
||||
----
|
||||
|
32
doc/misc-args.txt
Normal file
32
doc/misc-args.txt
Normal file
@ -0,0 +1,32 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
Miscellaneous Additional Arguments
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If you want to generate the Java classes to represent tables without
|
||||
actually performing an import, supply a connect string and
|
||||
(optionally) credentials as above, as well as +--all-tables+ or
|
||||
+--table+, but also use the +--generate-only+ argument. This will
|
||||
generate the classes and cease further operation.
|
||||
|
||||
You can override the +$HADOOP_HOME+ environment variable within Sqoop
|
||||
with the +--hadoop-home+ argument. You can override the +$HIVE_HOME+
|
||||
environment variable with +--hive-home+.
|
||||
|
39
doc/output-formatting-args.txt
Normal file
39
doc/output-formatting-args.txt
Normal file
@ -0,0 +1,39 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
--fields-terminated-by (char)::
|
||||
Sets the field separator character
|
||||
|
||||
--lines-terminated-by (char)::
|
||||
Sets the end-of-line character
|
||||
|
||||
--optionally-enclosed-by (char)::
|
||||
Sets a field-enclosing character which may be used if a
|
||||
value contains delimiter characters.
|
||||
|
||||
--enclosed-by (char)::
|
||||
Sets a field-enclosing character which will be used for all fields.
|
||||
|
||||
--escaped-by (char)::
|
||||
Sets the escape character
|
||||
|
||||
--mysql-delimiters::
|
||||
Uses MySQL's default delimiter set:
|
||||
+
|
||||
fields: , lines: \n escaped-by: \ optionally-enclosed-by: '
|
||||
|
44
doc/output-formatting.txt
Normal file
44
doc/output-formatting.txt
Normal file
@ -0,0 +1,44 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
The delimiters used to separate fields and records can be specified
|
||||
on the command line, as can a quoting character and an escape character
|
||||
(for quoting delimiters inside a value). Data imported with
|
||||
+--as-textfile+ will be formatted according to these parameters. Classes
|
||||
generated by Sqoop will encode this information, so using +toString()+
|
||||
from a data record stored +--as-sequencefile+ will reproduce your
|
||||
specified formatting.
|
||||
|
||||
The +(char)+ argument for each argument in this section can be specified
|
||||
either as a normal character (e.g., +--fields-terminated-by ,+) or via
|
||||
an escape sequence. Arguments of the form +\0xhhh+ will be interpreted
|
||||
as a hexadecimal representation of a character with hex number _hhh_.
|
||||
Arguments of the form +\0ooo+ will be treated as an octal representation
|
||||
of a character represented by octal number _ooo_. The special escapes
|
||||
+\n+, +\r+, +\"+, +\b+, +\t+, and +\\+ act as they do inside Java strings. +\0+ will be
|
||||
treated as NUL. This will insert NUL characters between fields or lines
|
||||
(if used for +--fields-terminated-by+ or +--lines-terminated-by+), or will
|
||||
disable enclosing/escaping if used for one of the +--enclosed-by+,
|
||||
+--optionally-enclosed-by+, or +--escaped-by+ arguments.
|
||||
|
||||
The default delimiters are +,+ for fields, +\n+ for records, no quote
|
||||
character, and no escape character. Note that this can lead to
|
||||
ambiguous/unparsable records if you import database records containing
|
||||
commas or newlines in the field data. For unambiguous parsing, both must
|
||||
be enabled, e.g., via +--mysql-delimiters+.
|
||||
|
55
doc/supported-dbs.txt
Normal file
55
doc/supported-dbs.txt
Normal file
@ -0,0 +1,55 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
Supported Databases
|
||||
-------------------
|
||||
|
||||
Sqoop uses JDBC to connect to databases. JDBC is a compatibility layer
|
||||
that allows a program to access many different databases through a common
|
||||
API. Slight differences in the SQL language spoken by each database, however,
|
||||
may mean that Sqoop can't use every database out of the box, or that some
|
||||
databases may be used in an inefficient manner.
|
||||
|
||||
When you provide a connect string to Sqoop, it inspects the protocol scheme to
|
||||
determine appropriate vendor-specific logic to use. If Sqoop knows about
|
||||
a given database, it will work automatically. If not, you may need to
|
||||
specify the driver class to load via +--driver+. This will use a generic
|
||||
code path which will use standard SQL to access the database. Sqoop provides
|
||||
some databases with faster, non-JDBC-based access mechanisms. These can be
|
||||
enabled by specifying the +--direct+ parameter.
|
||||
|
||||
Sqoop includes vendor-specific code paths for the following databases:
|
||||
|
||||
[grid="all"]
|
||||
`-----------`--------`--------------------`---------------------
|
||||
Database version +--direct+ support? connect string matches
|
||||
----------------------------------------------------------------
|
||||
HSQLDB 1.8.0+ No +jdbc:hsqldb:*//+
|
||||
MySQL 5.0+ Yes +jdbc:mysql://+
|
||||
Oracle 10.2.0+ No +jdbc:oracle:*//+
|
||||
PostgreSQL 8.3+ Yes +jdbc:postgresql://+
|
||||
----------------------------------------------------------------
|
||||
|
||||
Sqoop may work with older versions of the databases listed, but we have
|
||||
only tested it with the versions specified above.
|
||||
|
||||
Even if Sqoop supports a database internally, you may still need to
|
||||
install the database vendor's JDBC driver in your +$HADOOP_HOME/lib+
|
||||
path.
|
||||
|
68
doc/table-import.txt
Normal file
68
doc/table-import.txt
Normal file
@ -0,0 +1,68 @@
|
||||
|
||||
////
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
////
|
||||
|
||||
|
||||
Importing Individual Tables
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
In addition to full-database imports, Sqoop will allow you to import
|
||||
individual tables. Instead of using +--all-tables+, specify the name of
|
||||
a particular table with the +--table+ argument:
|
||||
|
||||
----
|
||||
$ sqoop --connect jdbc:mysql://database.example.com/employees \
|
||||
--table employee_names
|
||||
----
|
||||
|
||||
You can further specify a subset of the columns in a table by using
|
||||
the +--columns+ argument. This takes a list of column names, delimited
|
||||
by commas, with no spaces in between. e.g.:
|
||||
|
||||
----
|
||||
$ sqoop --connect jdbc:mysql://database.example.com/employees \
|
||||
--table employee_names --columns employee_id,first_name,last_name,dept_id
|
||||
----
|
||||
|
||||
Sqoop will use a MapReduce job to read sections of the table in
|
||||
parallel. For the MapReduce tasks to divide the table space, the
|
||||
results returned by the database must be orderable. Sqoop will
|
||||
automatically detect the primary key for a table and use that to order
|
||||
the results. If no primary key is available, or (less likely) you want
|
||||
to order the results along a different column, you can specify the
|
||||
column name with +--split-by+.
|
||||
|
||||
.Row ordering
|
||||
IMPORTANT: To guarantee correctness of your input, you must select an
|
||||
ordering column for which each row has a unique value. If duplicate
|
||||
values appear in the ordering column, the results of the import are
|
||||
undefined, and Sqoop will not be able to detect the error.
|
||||
|
||||
Finally, you can control which rows of a table are imported via the
|
||||
+--where+ argument. With this argument, you may specify a clause to be
|
||||
appended to the SQL statement used to select rows from the table,
|
||||
e.g.:
|
||||
|
||||
----
|
||||
$ sqoop --connect jdbc:mysql://database.example.com/employees \
|
||||
--table employee_names --where "employee_id > 40 AND active = 1"
|
||||
----
|
||||
|
||||
The +--columns+, +--split-by+, and +--where+ arguments are incompatible with
|
||||
+--all-tables+. If you require special handling for some of the tables,
|
||||
then you must manually run a separate import job for each table.
|
||||
|
186
readme.html
186
readme.html
@ -1,186 +0,0 @@
|
||||
<html>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<head>
|
||||
<title>Sqoop User's Guide</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1><a name="SqoopUsersGuide-Sqoop"></a>Sqoop</h1>
|
||||
|
||||
<h2><a name="SqoopUsersGuide-Overview"></a>Overview</h2>
|
||||
|
||||
<p>Sqoop is a tool designed to help users of large data import existing relational databases into their Hadoop clusters. Sqoop uses JDBC to connect to a database, examine the schema for tables, and auto-generate the necessary classes to import data into HDFS. It then instantiates a MapReduce job to read the table from the database via the DBInputFormat (JDBC-based InputFormat). The table is read into a set of files loaded into HDFS. Both SequenceFile and text-based targets are supported.</p>
|
||||
|
||||
<p>Longer term, Sqoop will support automatic connectivity to Hive, with the ability to load data files directly into the Hive warehouse directory, and also to inject the appropriate table definition into the metastore.</p>
|
||||
|
||||
<h2><a name="SqoopUsersGuide-GettingStarted"></a>Getting Started</h2>
|
||||
|
||||
<p><b>Getting Sqoop</b> Sqoop is distributed as a "contrib" jar with Hadoop. It is built in the <tt>contrib/sqoop/</tt> directory.</p>
|
||||
|
||||
<p>You can run Sqoop by running:</p>
|
||||
|
||||
<div class="code panel" style="border-width: 1px;"><div class="codeContent panelContent">
|
||||
<pre class="code-java">
|
||||
$ hadoop jar /path/to/sqoop.jar org.apache.hadoop.sqoop.Sqoop (options)
|
||||
</pre>
|
||||
</div></div>
|
||||
|
||||
|
||||
<p>This does nothing of interest without any options. The <tt>--help</tt> option displays the full usage instructions.</p>
|
||||
|
||||
<h3><a name="SqoopUsersGuide-ConnectingtoaDatabaseServer"></a>Connecting to a Database Server</h3>
|
||||
|
||||
<p>Sqoop is designed to import tables from a database into HDFS. As such, it requires a <em>connect string</em> that describes how to connect to the database. The <em>connect string</em> looks like a URL, and is communicated to Sqoop with the <tt>--connect</tt> argument. This describes the server and database to connect to; it may also specify the port. e.g.: </p>
|
||||
|
||||
<div class="code panel" style="border-width: 1px;"><div class="codeContent panelContent">
|
||||
|
||||
<pre class="code-java">
|
||||
$ hadoop jar /path/to/sqoop.jar org.apache.hadoop.sqoop.Sqoop --connect jdbc:mysql:<span class="code-comment">//database.example.com/employees</span>
|
||||
</pre>
|
||||
</div></div>
|
||||
|
||||
<p>This string will connect to a MySQL database named <tt>employees</tt> on the host <tt>database.example.com</tt>. It's important that you <b>do not</b> use the URL <tt>localhost</tt> if you intend to use Sqoop with a distributed Hadoop cluster. The connect string you supply will be used on all the TaskTracker nodes in your MapReduce cluster; if they're told to connect to the literal name <tt>localhost</tt>, they'll each reach a different database (or more likely, no database at all)! Instead, you should use the full DNS or IP address of the database host that can be seen by all your remote nodes.</p>
|
||||
|
||||
<p>You may need to authenticate against the database before you can access it. The <tt>--username</tt> and <tt>--password</tt> parameters can be used to supply a username and a password to the database. (Note: password access currently requires passing the password on the command-line, which is insecure.) e.g.:</p>
|
||||
|
||||
<div class="code panel" style="border-width: 1px;"><div class="codeContent panelContent">
|
||||
<pre class="code-java">
|
||||
$ hadoop jar /path/to/sqoop.jar org.apache.hadoop.sqoop.Sqoop --connect jdbc:mysql:<span class="code-comment">//database.example.com/employees --username aaron --password 12345</span>
|
||||
</pre>
|
||||
</div></div>
|
||||
|
||||
<p>Sqoop automatically supports <span class="nobr"><a href="http://www.mysql.com" rel="nofollow">MySQL</a></span> and <span class="nobr"><a href="http://hsqldb.org/" rel="nofollow">HSQLDB</a></span>. Connect strings beginning with <tt>jdbc:mysql://</tt> and <tt>jdbc:hsqldb:hsql://</tt> automatically inform Sqoop of the correct JDBC driver class to load. HSQLDB's JDBC driver is bundled with Hadoop, and so will work "out of the box." If you install <a href="http://dev.mysql.com/downloads/connector/j/5.1.html">MySQL's Connector/J driver</a> in Hadoop's <tt>lib/</tt> directory, Sqoop will also automatically take advantage of this for any <tt>jdbc:mysql://</tt> connect strings you use. You can use Sqoop with any other JDBC-compliant database as well. First, download the appropriate JDBC driver for the database you want to import from, and install the <tt>.jar</tt> file in the <tt>$HADOOP_HOME/lib</tt> directory on all machines in your Hadoop cluster, or some other directory which is in the classpath on all nodes. Each driver jar also has a specific <em>driver class</em> which defines the entry-point to the driver. For example, MySQL's Connector/J library has a driver class of <tt>com.mysql.jdbc.Driver</tt>. Refer to your database vendor-specific documentation to determine the main driver class. This class must be provided as an argument to Sqoop with <tt>--driver</tt>.</p>
|
||||
|
||||
<p>For example, to connect to a postgres database, first download the driver from <span class="nobr"><a href="http://jdbc.postgresql.org" rel="nofollow">http://jdbc.postgresql.org</a></span> and install it in your Hadoop lib path. Then run Sqoop with something like:</p>
|
||||
|
||||
<div class="code panel" style="border-width: 1px;"><div class="codeContent panelContent">
|
||||
<pre class="code-java">
|
||||
$ hadoop jar /path/to/sqoop.jar org.apache.hadoop.sqoop.Sqoop --connect jdbc:postgresql:<span class="code-comment">//postgres-server.example.com/employees --driver org.postgresql.Driver</span>
|
||||
</pre>
|
||||
</div></div>
|
||||
|
||||
<p>Note: Sqoop uses the JDBC specification to connect to databases; this should provide a versatile client that interoperates with many different databases. That having been said, we have only thoroughly tested this tool with HSQLDB and MySQL.</p>
|
||||
|
||||
<h3><a name="SqoopUsersGuide-ListingAvailableDatabases"></a>Listing Available Databases</h3>
|
||||
|
||||
<p>Once connected to a database server, you can list the available databases with the <tt>--list-databases</tt> parameter. This currently is supported only by HSQLDB and MySQL. Note that in this case, the connect string does not include a database name, just a server address.</p>
|
||||
|
||||
<div class="code panel" style="border-width: 1px;"><div class="codeContent panelContent">
|
||||
<pre class="code-java">
|
||||
$ hadoop jar /path/to/sqoop.jar org.apache.hadoop.sqoop.Sqoop --connect jdbc:mysql:<span class="code-comment">//database.example.com/ --list-databases
|
||||
</span>information_schema
|
||||
employees
|
||||
</pre>
|
||||
</div></div>
|
||||
|
||||
<p><em>This only works with HSQLDB and MySQL. A vendor-agnostic implementation of this function has not yet been implemented.</em></p>
|
||||
|
||||
<h3><a name="SqoopUsersGuide-ListingAvailableTables"></a>Listing Available Tables</h3>
|
||||
|
||||
<p>Within a database, you can list the tables available for import with the <tt>--list-tables</tt> command. The following example shows four tables available within the "employees" example database:</p>
|
||||
|
||||
<div class="code panel" style="border-width: 1px;"><div class="codeContent panelContent">
|
||||
<pre class="code-java">
|
||||
$ hadoop jar /path/to/sqoop.jar org.apache.hadoop.sqoop.Sqoop --connect jdbc:mysql:<span class="code-comment">//database.example.com/employees --list-tables
|
||||
|
||||
</span>employee_names
|
||||
payroll_checks
|
||||
job_descriptions
|
||||
office_supplies
|
||||
</pre>
|
||||
</div></div>
|
||||
|
||||
<h2><a name="SqoopUsersGuide-AutomaticFulldatabaseImport"></a>Automatic Full-database Import</h2>
|
||||
|
||||
<p>If you want to import all the tables in a database, you can use the <tt>--all-tables</tt> command to do so:</p>
|
||||
|
||||
<div class="code panel" style="border-width: 1px;"><div class="codeContent panelContent">
|
||||
<pre class="code-java">
|
||||
$ hadoop jar /path/to/sqoop.jar org.apache.hadoop.sqoop.Sqoop --connect jdbc:mysql:<span class="code-comment">//database.example.com/employees --all-tables</span>
|
||||
|
||||
</pre>
|
||||
</div></div>
|
||||
|
||||
<p>This will query the database for the available tables, generate an ORM class for each table, and run a MapReduce job to import each one. Hadoop uses the <span class="nobr"><a href="http://issues.apache.org/jira/browse/HADOOP-2536" rel="nofollow">DBInputFormat</a></span> to read from a database into a Mapper instance. To read a table into a MapReduce program requires creating a class to hold the fields of one row of the table. One of the benefits of Sqoop is that it generates this class definition for you, based on the table definition in the database. </p>
|
||||
|
||||
<p>The generated <tt>.java</tt> files are, by default, placed in the current directory. You can supply a different directory with the <tt>--outdir</tt> parameter. These are then compiled into <tt>.class</tt> and <tt>.jar</tt> files for use by the MapReduce job that it launches. These files are created in a temporary directory. You can redirect this target with <tt>--bindir</tt>.</p>
|
||||
|
||||
<p>Each table will be imported into a separate directory in HDFS, with the same name as the table. For instance, if my Hadoop username is <tt>aaron</tt>, the above command would have generated the following directories in HDFS:</p>
|
||||
|
||||
<div class="code panel" style="border-width: 1px;"><div class="codeContent panelContent">
|
||||
<pre class="code-java">
|
||||
/user/aaron/employee_names
|
||||
/user/aaron/payroll_checks
|
||||
/user/aaron/job_descriptions
|
||||
/user/aaron/office_supplies
|
||||
</pre>
|
||||
</div></div>
|
||||
|
||||
<p>You can change the base directory under which the tables are loaded with the <tt>--warehouse-dir</tt> parameter. For example:</p>
|
||||
|
||||
<div class="code panel" style="border-width: 1px;"><div class="codeContent panelContent">
|
||||
<pre class="code-java">
|
||||
$ hadoop jar /path/to/sqoop.jar org.apache.hadoop.sqoop.Sqoop --connect jdbc:mysql:<span class="code-comment">//database.example.com/employees --all-tables --warehouse-dir /common/warehouse</span>
|
||||
</pre>
|
||||
</div></div>
|
||||
|
||||
<p>This would create the following directories instead:</p>
|
||||
<div class="code panel" style="border-width: 1px;"><div class="codeContent panelContent">
|
||||
<pre class="code-java">
|
||||
/common/warehouse/employee_names
|
||||
/common/warehouse/payroll_checks
|
||||
/common/warehouse/job_descriptions
|
||||
/common/warehouse/office_supplies
|
||||
</pre>
|
||||
</div></div>
|
||||
|
||||
<p>By default the data will be read into text files in HDFS. Each of the columns will be represented as comma-delimited text. Each row is terminated by a newline. There is currently no mechanism to quote or escape commas or newlines inside of <tt>CHAR</tt> or <tt>VARCHAR</tt> columns of the database. Applications which depend on comma-delimited parsing of the output files must be careful if commas or newlines may be present in the database. </p>
|
||||
|
||||
<p>If you expect commas or newlines to appear in text columns of the database, or you want to leverage compression and binary file formats, the <tt>--as-sequencefile</tt> argument to Sqoop will import the table to a set of SequenceFiles instead. As this uses a separate object for each field of each database record, no quoting or escaping of values is necessary. This representation is also likely to be higher performance when used as an input to subsequent MapReduce programs. For completeness, Sqoop provides an <tt>--as-textfile</tt> option, which is implied by default. An <tt>--as-textfile</tt> on the command-line will override a previous <tt>--as-sequencefile</tt> argument.</p>
|
||||
|
||||
<p>The SequenceFile format will embed the records from the database as objects using the code generated by Sqoop. It is important that you retain the <tt>.java file</tt> for this class, as you will need to be able to instantiate the same type to read the objects back later, in other user-defined applications.</p>
|
||||
|
||||
<h2><a name="SqoopUsersGuide-ImportingIndividualTables"></a>Importing Individual Tables</h2>
|
||||
|
||||
<p>In addition to full-database imports, Sqoop will allow you to import individual tables. Instead of using <tt>--all-tables</tt>, specify the name of a particular table with the <tt>--table</tt> argument:</p>
|
||||
|
||||
<div class="code panel" style="border-width: 1px;"><div class="codeContent panelContent">
|
||||
<pre class="code-java">
|
||||
$ hadoop jar /path/to/sqoop.jar org.apache.hadoop.sqoop.Sqoop --connect jdbc:mysql:<span class="code-comment">//database.example.com/employees --table employee_names </span>
|
||||
</pre>
|
||||
</div></div>
|
||||
|
||||
<p>You can further specify a subset of the columns in a table by using the <tt>--columns</tt> argument. This takes a list of column names, delimited by commas, with no spaces in between. e.g.:</p>
|
||||
|
||||
<div class="code panel" style="border-width: 1px;"><div class="codeContent panelContent">
|
||||
<pre class="code-java">
|
||||
|
||||
$ hadoop jar /path/to/sqoop.jar org.apache.hadoop.sqoop.Sqoop --connect jdbc:mysql:<span class="code-comment">//database.example.com/employees --table employee_names --columns employee_id,first_name,last_name,dept_id</span>
|
||||
</pre>
|
||||
</div></div>
|
||||
|
||||
<p>Sqoop will use a MapReduce job to read sections of the table in parallel. For the MapReduce tasks to divide the table space, the results returned by the database must be orderable. Sqoop will automatically detect the primary key for a table and use that to order the results. If no primary key is available, or (less likely) you want to order the results along a different column, you can specify the column name with <tt>--order-by</tt>. <b>Important:</b> To guarantee correctness of your input, you must select an ordering column for which each row has a unique value. If duplicate values appear in the ordering column, the results of the import are undefined, and Sqoop will not be able to detect the error.</p>
|
||||
|
||||
<p>The <tt>--columns</tt> and <tt>--order-by</tt> arguments are incompatible with <tt>--all-tables</tt>. If you require special handling for some of the tables, then you must manually run a separate import job for each table.</p>
|
||||
|
||||
<h2><a name="SqoopUsersGuide-MiscellaneousAdditionalArguments"></a>Miscellaneous Additional Arguments</h2>
|
||||
|
||||
<p>If you want to generate the Java classes to represent tables without actually performing an import, supply a connect string and (optionally) credentials as above, as well as <tt>--all-tables</tt> or <tt>--table</tt>, but also use the <b><tt>--generate-only</tt></b> argument. This will generate the classes and cease further operation.</p>
|
||||
|
||||
<p>You can override the <tt>$HADOOP_HOME</tt> environment variable within Sqoop with the <tt>--hadoop-home</tt> argument. </p>
|
||||
|
||||
</body></html>
|
15
readme.txt
Normal file
15
readme.txt
Normal file
@ -0,0 +1,15 @@
|
||||
Sqoop documentation is in the doc/ directory in asciidoc format.
|
||||
|
||||
Run 'ant doc' to build the documentation. It will be created in
|
||||
$HADOOP_HOME/build/contrib/sqoop/doc.
|
||||
|
||||
There will be a manpage (sqoop.1.gz) and a User Guide formatted in HTML.
|
||||
|
||||
This process requires the following programs:
|
||||
asciidoc
|
||||
gzip
|
||||
make
|
||||
python 2.5+
|
||||
xmlto
|
||||
|
||||
For more information about asciidoc, see http://www.methods.co.nz/asciidoc/
|
Loading…
Reference in New Issue
Block a user