diff --git a/src/docs/Makefile b/src/docs/Makefile index 587c051d..5cf3b090 100644 --- a/src/docs/Makefile +++ b/src/docs/Makefile @@ -16,28 +16,40 @@ BUILDROOT=../../build BUILD_DIR=$(BUILDROOT)/docs -all: man userguide +all: man userguide devguide man: $(BUILD_DIR)/sqoop.1.gz userguide: $(BUILD_DIR)/SqoopUserGuide.html -$(BUILD_DIR)/sqoop.1.gz: Sqoop-manpage.txt *formatting*.txt - asciidoc -b docbook -d manpage Sqoop-manpage.txt +devguide: $(BUILD_DIR)/SqoopDevGuide.html + +$(BUILD_DIR)/sqoop.1.gz: user/Sqoop-manpage.txt user/*formatting*.txt + asciidoc -b docbook -d manpage user/Sqoop-manpage.txt xmlto man Sqoop-manpage.xml gzip sqoop.1 rm Sqoop-manpage.xml mkdir -p $(BUILD_DIR) mv sqoop.1.gz $(BUILD_DIR) -$(BUILD_DIR)/SqoopUserGuide.html: SqoopUserGuide.txt *.txt - asciidoc SqoopUserGuide.txt +$(BUILD_DIR)/SqoopUserGuide.html: user/*.txt + asciidoc -a toc -a toclevels=1 -a "toc-title=Table of Contents" \ + user/SqoopUserGuide.txt mkdir -p $(BUILD_DIR) - mv SqoopUserGuide.html $(BUILD_DIR) + mv user/SqoopUserGuide.html $(BUILD_DIR) + +$(BUILD_DIR)/SqoopDevGuide.html: dev/*.txt + asciidoc -a toc -a toclevels=1 -a "toc-title=Table of Contents" \ + dev/SqoopDevGuide.txt + mkdir -p $(BUILD_DIR) + mv dev/SqoopDevGuide.html $(BUILD_DIR) clean: -rm $(BUILD_DIR)/sqoop.1.gz -rm $(BUILD_DIR)/SqoopUserGuide.html + -rm $(BUILD_DIR)/SqoopUserGuide.pdf + -rm user/SqoopUserGuide.html + -rm dev/SqoopDevGuide.html -.PHONY: all man userguide clean +.PHONY: all man userguide devguide clean diff --git a/src/docs/SqoopUserGuide.txt b/src/docs/SqoopUserGuide.txt deleted file mode 100644 index 0b00f452..00000000 --- a/src/docs/SqoopUserGuide.txt +++ /dev/null @@ -1,67 +0,0 @@ - -//// - Licensed to Cloudera, Inc. under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -//// - - -include::intro.txt[] - - -The Sqoop Command Line ----------------------- - -To execute Sqoop, run with Hadoop: ----- -$ bin/hadoop jar contrib/sqoop/hadoop-$(version)-sqoop.jar (arguments) ----- - -NOTE:Throughput this document, we will use `sqoop` as shorthand for the -above. i.e., `$ sqoop (arguments)` - -You pass this program options describing the -import job you want to perform. If you need a hint, running Sqoop with -`--help` will print out a list of all the command line -options available. The +sqoop(1)+ manual page will also describe -Sqoop's available arguments in greater detail. The manual page is built -in `$HADOOP_HOME/build/contrib/sqoop/doc/sqoop.1.gz`. -The following subsections will describe the most common modes of operation. 
- -include::connecting.txt[] - -include::listing-dbs.txt[] - -include::listing-tables.txt[] - -include::full-db-import.txt[] - -include::table-import.txt[] - -include::controlling-output-format.txt[] - -include::classnames.txt[] - -include::misc-args.txt[] - -include::direct.txt[] - -include::hive.txt[] - -include::export.txt[] - -include::supported-dbs.txt[] - -include::api-reference.txt[] - diff --git a/src/docs/listing-dbs.txt b/src/docs/dev/SqoopDevGuide.txt similarity index 58% rename from src/docs/listing-dbs.txt rename to src/docs/dev/SqoopDevGuide.txt index a2eaa596..4a6f3ed6 100644 --- a/src/docs/listing-dbs.txt +++ b/src/docs/dev/SqoopDevGuide.txt @@ -16,20 +16,13 @@ limitations under the License. //// +include::intro.txt[] -Listing Available Databases -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +include::preface.txt[] + +include::compiling.txt[] + +include::api-reference.txt[] -Once connected to a database server, you can list the available -databases with the +--list-databases+ parameter. This currently is supported -only by HSQLDB and MySQL. Note that in this case, the connect string does -not include a database name, just a server address. ----- -$ sqoop --connect jdbc:mysql://database.example.com/ --list-databases -information_schema -employees ----- -_This only works with HSQLDB and MySQL. A vendor-agnostic implementation of -this function has not yet been implemented._ diff --git a/src/docs/api-reference.txt b/src/docs/dev/api-reference.txt similarity index 53% rename from src/docs/api-reference.txt rename to src/docs/dev/api-reference.txt index 332b7104..aef9a1c9 100644 --- a/src/docs/api-reference.txt +++ b/src/docs/dev/api-reference.txt @@ -19,29 +19,38 @@ Developer API Reference ----------------------- -This section is intended to specify the APIs available to application writers -integrating with Sqoop, and those modifying Sqoop. The next three subsections -are written from the following three perspectives: those using classes generated -by Sqoop, and its public library; those writing Sqoop extensions (i.e., -additional ConnManager implementations that interact with more databases); and -those modifying Sqoop's internals. Each section describes the system in -successively greater depth. +This section specifies the APIs available to application writers who +want to integrate with Sqoop, and those who want to modify Sqoop. + +The next three subsections are written for the following use cases: + +- Using classes generated by Sqoop and its public library +- Writing Sqoop extensions (that is, additional ConnManager implementations + that interact with more databases) +- Modifying Sqoop's internals + +Each section describes the system in successively greater depth. The External API ~~~~~~~~~~~~~~~~ -Sqoop auto-generates classes that represent the tables imported into HDFS. The -class contains member fields for each column of the imported table; an instance -of the class holds one row of the table. The generated classes implement the -serialization APIs used in Hadoop, namely the _Writable_ and _DBWritable_ -interfaces. They also contain other convenience methods: a +parse()+ method -that interprets delimited text fields, and a +toString()+ method that preserves -the user's chosen delimiters. The full set of methods guaranteed to exist in an -auto-generated class are specified in the interface +Sqoop automatically generates classes that represent the tables +imported into the Hadoop Distributed File System (HDFS). 
The class +contains member fields for each column of the imported table; an +instance of the class holds one row of the table. The generated +classes implement the serialization APIs used in Hadoop, namely the +_Writable_ and _DBWritable_ interfaces. They also contain these other +convenience methods: + +- A parse() method that interprets delimited text fields +- A toString() method that preserves the user's chosen delimiters + +The full set of methods guaranteed to exist in an auto-generated class +is specified in the abstract class +org.apache.hadoop.sqoop.lib.SqoopRecord+. -Instances of _SqoopRecord_ may depend on Sqoop's public API. This is all classes +Instances of +SqoopRecord+ may depend on Sqoop's public API. This is all classes in the +org.apache.hadoop.sqoop.lib+ package. These are briefly described below. Clients of Sqoop should not need to directly interact with any of these classes, although classes generated by Sqoop will depend on them. Therefore, these APIs @@ -57,16 +66,21 @@ are considered public and care will be taken when forward-evolving them. * +BigDecimalSerializer+ contains a pair of methods that facilitate serialization of +BigDecimal+ objects over the _Writable_ interface. +The full specification of the public API is available on the Sqoop +Development Wiki as +http://wiki.github.com/cloudera/sqoop/sip-4[SIP-4]. + + The Extension API ~~~~~~~~~~~~~~~~~ This section covers the API and primary classes used by extensions for Sqoop which allow Sqoop to interface with more database vendors. -While Sqoop uses JDBC and +DBInputFormat+ (and +DataDrivenDBInputFormat+) to +While Sqoop uses JDBC and +DataDrivenDBInputFormat+ to read from databases, differences in the SQL supported by different vendors as well as JDBC metadata necessitates vendor-specific codepaths for most databases. -Sqoop's solution to this problem is by introducing the ConnManager API +Sqoop's solution to this problem is by introducing the +ConnManager+ API (+org.apache.hadoop.sqoop.manager.ConnMananger+). +ConnManager+ is an abstract class defining all methods that interact with the @@ -80,40 +94,46 @@ selectively override behavior. For example, the +getColNamesQuery()+ method allows the SQL query used by +getColNames()+ to be modified without needing to rewrite the majority of +getColNames()+. -+ConnManager+ implementations receive a lot of their configuration data from a -Sqoop-specific class, +SqoopOptions+. While +SqoopOptions+ does not currently -contain many setter methods, clients should not assume +SqoopOptions+ are -immutable. More setter methods may be added in the future. +SqoopOptions+ does -not directly store specific per-manager options. Instead, it contains a -reference to the +Configuration+ returned by +Tool.getConf()+ after parsing -command-line arguments with the +GenericOptionsParser+. This allows extension -arguments via "+-D any.specific.param=any.value+" without requiring any layering -of options parsing or modification of +SqoopOptions+. ++ConnManager+ implementations receive a lot of their configuration +data from a Sqoop-specific class, +SqoopOptions+. +SqoopOptions+ are +mutable. +SqoopOptions+ does not directly store specific per-manager +options. Instead, it contains a reference to the +Configuration+ +returned by +Tool.getConf()+ after parsing command-line arguments with +the +GenericOptionsParser+. This allows extension arguments via "+-D +any.specific.param=any.value+" without requiring any layering of +options parsing or modification of +SqoopOptions+. 
This ++Configuration+ forms the basis of the +Configuration+ passed to any +MapReduce +Job+ invoked in the workflow, so that users can set on the +command-line any necessary custom Hadoop state. -All existing +ConnManager+ implementations are stateless. Thus, the system which -instantiates +ConnManagers+ may implement multiple instances of the same -+ConnMananger+ class over Sqoop's lifetime. If a caching layer is required, we -can add one later, but it is not currently available. +All existing +ConnManager+ implementations are stateless. Thus, the +system which instantiates +ConnManagers+ may implement multiple +instances of the same +ConnMananger+ class over Sqoop's lifetime. It +is currently assumed that instantiating a +ConnManager+ is a +lightweight operation, and is done reasonably infrequently. Therefore, ++ConnManagers+ are not cached between operations, etc. -+ConnManagers+ are currently created by instances of the abstract class +ManagerFactory+ (See -MAPREDUCE-750). One +ManagerFactory+ implementation currently serves all of -Sqoop: +org.apache.hadoop.sqoop.manager.DefaultManagerFactory+. Extensions -should not modify +DefaultManagerFactory+. Instead, an extension-specific -+ManagerFactory+ implementation should be provided with the new ConnManager. -+ManagerFactory+ has a single method of note, named +accept()+. This method will -determine whether it can instantiate a +ConnManager+ for the user's -+SqoopOptions+. If so, it returns the +ConnManager+ instance. Otherwise, it -returns +null+. ++ConnManagers+ are currently created by instances of the abstract +class +ManagerFactory+ (See +http://issues.apache.org/jira/browse/MAPREDUCE-750[]). One ++ManagerFactory+ implementation currently serves all of Sqoop: ++org.apache.hadoop.sqoop.manager.DefaultManagerFactory+. Extensions +should not modify +DefaultManagerFactory+. Instead, an +extension-specific +ManagerFactory+ implementation should be provided +with the new +ConnManager+. +ManagerFactory+ has a single method of +note, named +accept()+. This method will determine whether it can +instantiate a +ConnManager+ for the user's +SqoopOptions+. If so, it +returns the +ConnManager+ instance. Otherwise, it returns +null+. The +ManagerFactory+ implementations used are governed by the -+sqoop.connection.factories+ setting in sqoop-site.xml. Users of extension ++sqoop.connection.factories+ setting in +sqoop-site.xml+. Users of extension libraries can install the 3rd-party library containing a new +ManagerFactory+ -and +ConnManager+(s), and configure sqoop-site.xml to use the new +and +ConnManager+(s), and configure +sqoop-site.xml+ to use the new +ManagerFactory+. The +DefaultManagerFactory+ principly discriminates between databases by parsing the connect string stored in +SqoopOptions+. Extension authors may make use of classes in the +org.apache.hadoop.sqoop.io+, -+mapred+, +mapreduce+, and +util+ packages to facilitate their implementations. ++mapreduce+, and +util+ packages to facilitate their implementations. These packages and classes are described in more detail in the following section. @@ -134,35 +154,43 @@ General program flow The general program flow is as follows: +org.apache.hadoop.sqoop.Sqoop+ is the main class and implements _Tool_. A new -instance is launched with +ToolRunner+. It parses its arguments using the -+SqoopOptions+ class. Within the +SqoopOptions+, an +ImportAction+ will be -chosen by the user. This may be import all tables, import one specific table, -execute a SQL statement, or others. 
+instance is launched with +ToolRunner+. The first argument to Sqoop is +a string identifying the name of a +SqoopTool+ to run. The +SqoopTool+ +itself drives the execution of the user's requested operation (e.g., +import, export, codegen, etc). -A +ConnManager+ is then instantiated based on the data in the +SqoopOptions+. -The +ConnFactory+ is used to get a +ConnManager+ from a +ManagerFactory+; the -mechanics of this were described in an earlier section. +The +SqoopTool+ API is specified fully in +http://wiki.github.com/cloudera/sqoop/sip-1[SIP-1]. -Then in the +run()+ method, using a case statement, it determines which actions -the user needs performed based on the +ImportAction+ enum. Usually this involves -determining a list of tables to import, generating user code for them, and -running a MapReduce job per table to read the data. The import itself does not -specifically need to be run via a MapReduce job; the +ConnManager.importTable()+ -method is left to determine how best to run the import. Each of these actions is -controlled by the +ConnMananger+, except for the generating of code, which is -done by the +CompilationManager+ and +ClassWriter+. (Both in the -+org.apache.hadoop.sqoop.orm+ package.) Importing into Hive is also taken care -of via the +org.apache.hadoop.sqoop.hive.HiveImport+ class after the -+importTable()+ has completed. This is done without concern for the -+ConnManager+ implementation used. +The chosen +SqoopTool+ will parse the remainder of the arguments, +setting the appropriate fields in the +SqoopOptions+ class. It will +then run its body. -A ConnManager's +importTable()+ method receives a single argument of type -+ImportJobContext+ which contains parameters to the method. This class may be -extended with additional parameters in the future, which optionally further -direct the import operation. Similarly, the +exportTable()+ method receives an -argument of type +ExportJobContext+. These classes contain the name of the table -to import/export, a reference to the +SqoopOptions+ object, and other related -data. +Then in the SqoopTool's +run()+ method, the import or export or other +action proper is executed. Typically, a +ConnManager+ is then +instantiated based on the data in the +SqoopOptions+. The ++ConnFactory+ is used to get a +ConnManager+ from a +ManagerFactory+; +the mechanics of this were described in an earlier section. Imports +and exports and other large data motion tasks typically run a +MapReduce job to operate on a table in a parallel, reliable fashion. +An import does not specifically need to be run via a MapReduce job; +the +ConnManager.importTable()+ method is left to determine how best +to run the import. Each main action is actually controlled by the ++ConnMananger+, except for the generating of code, which is done by +the +CompilationManager+ and +ClassWriter+. (Both in the ++org.apache.hadoop.sqoop.orm+ package.) Importing into Hive is also +taken care of via the +org.apache.hadoop.sqoop.hive.HiveImport+ class +after the +importTable()+ has completed. This is done without concern +for the +ConnManager+ implementation used. + +A ConnManager's +importTable()+ method receives a single argument of +type +ImportJobContext+ which contains parameters to the method. This +class may be extended with additional parameters in the future, which +optionally further direct the import operation. Similarly, the ++exportTable()+ method receives an argument of type ++ExportJobContext+. 
These classes contain the name of the table to +import/export, a reference to the +SqoopOptions+ object, and other +related data. Subpackages ^^^^^^^^^^^ @@ -175,8 +203,9 @@ The following subpackages under +org.apache.hadoop.sqoop+ exist: * +lib+ - The external public API (described earlier). * +manager+ - The +ConnManager+ and +ManagerFactory+ interface and their implementations. -* +mapreduce+ - Classes interfacing with the new (0.20+) MapReduce API.... +* +mapreduce+ - Classes interfacing with the new (0.20+) MapReduce API. * +orm+ - Code auto-generation. +* +tool+ - Implementations of +SqoopTool+. * +util+ - Miscellaneous utility classes. The +io+ package contains _OutputStream_ and _BufferedWriter_ implementations @@ -185,11 +214,13 @@ BufferedWriter to be opened to a client which will, under the hood, write to multiple files in series as they reach a target threshold size. This allows unsplittable compression libraries (e.g., gzip) to be used in conjunction with Sqoop import while still allowing subsequent MapReduce jobs to use multiple -input splits per dataset. +input splits per dataset. The large object file storage (see +http://wiki.github.com/cloudera/sqoop/sip-3[SIP-3]) system's code +lies in the +io+ package as well. -The +mapreduce+ package contains +DataDrivenImportJob+, which uses the -+DataDrivenDBInputFormat+ introduced in 0.21. Most +ConnManager+ -implementations use +DataDrivenImportJob+ to perform their imports. +The +mapreduce+ package contains code that interfaces directly with +Hadoop MapReduce. This package's contents are described in more detail +in the next section. The +orm+ package contains code used for class generation. It depends on the JDK's tools.jar which provides the com.sun.tools.javac package. @@ -237,3 +268,29 @@ and forward the data along to HDFS, possibly performing formatting conversions in the meantime. +Interfacing with MapReduce +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Sqoop schedules MapReduce jobs to effect imports and exports. +Configuration and execution of MapReduce jobs follows a few common +steps (configuring the +InputFormat+; configuring the +OutputFormat+; +setting the +Mapper+ implementation; etc...). These steps are +formalized in the +org.apache.hadoop.sqoop.mapreduce.JobBase+ class. +The +JobBase+ allows a user to specify the +InputFormat+, ++OutputFormat+, and +Mapper+ to use. + ++JobBase+ itself is subclassed by +ImportJobBase+ and +ExportJobBase+ +which offer better support for the particular configuration steps +common to import or export-related jobs, respectively. ++ImportJobBase.runImport()+ will call the configuration steps and run +a job to import a table to HDFS. + +Subclasses of these base classes exist as well. For example, ++DataDrivenImportJob+ uses the +DataDrivenDBInputFormat+ to run an +import. This is the most common type of import used by the various ++ConnManager+ implementations available. MySQL uses a different class +(+MySQLDumpImportJob+) to run a direct-mode import. Its custom ++Mapper+ and +InputFormat+ implementations reside in this package as +well. + + diff --git a/src/docs/dev/compiling.txt b/src/docs/dev/compiling.txt new file mode 100644 index 00000000..f8bdd9c7 --- /dev/null +++ b/src/docs/dev/compiling.txt @@ -0,0 +1,32 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. 
licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + +Compiling Sqoop from Source +--------------------------- + +You can obtain the source code for Sqoop at: +http://github.com/cloudera/sqoop + +Sqoop source code is held in a +git+ repository. Instructions for +retrieving source from the repository are provided at: +http://wiki.github.com/cloudera/sqoop/DevelopmentProcess + +Compilation instructions are provided in the +COMPILING.txt+ file in +the root of the source repository. + diff --git a/src/docs/intro.txt b/src/docs/dev/intro.txt similarity index 55% rename from src/docs/intro.txt rename to src/docs/dev/intro.txt index 79322b82..3fbc8345 100644 --- a/src/docs/intro.txt +++ b/src/docs/dev/intro.txt @@ -20,15 +20,9 @@ Introduction ------------ -Sqoop is a tool designed to help users of large data import -existing relational databases into their Hadoop clusters. Sqoop uses -JDBC to connect to a database, examine each table's schema, and -auto-generate the necessary classes to import data into HDFS. It -then instantiates a MapReduce job to read tables from the database -via the DBInputFormat (JDBC-based InputFormat). Tables are read -into a set of files loaded into HDFS. Both SequenceFile and -text-based targets are supported. Sqoop also supports high-performance -imports from select databases including MySQL. +If you are a developer or an application programmer who intends to +modify Sqoop or build an extension using one of Sqoop's internal APIs, +you should read this document. The following sections describe the +purpose of each API, where internal APIs are used, and which APIs are +necessary for implementing support for additional databases. -This document describes how to get started using Sqoop to import -your data into Hadoop. diff --git a/src/docs/dev/preface.txt b/src/docs/dev/preface.txt new file mode 100644 index 00000000..ce9fa705 --- /dev/null +++ b/src/docs/dev/preface.txt @@ -0,0 +1,55 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + + +Supported Releases +------------------ + +This documentation applies to Sqoop v1.0.0 (June 2010). + +Sqoop Releases +-------------- + +Sqoop is an open source software product of Cloudera, Inc. Software +development for Sqoop occurs at http://github.com/cloudera/sqoop. 
At +that site, you can obtain: + +- New releases of Sqoop as well as its most recent source code +- An issue tracker +- A wiki that contains Sqoop documentation + + +Prerequisites +------------- + +The following prerequisite knowledge is required for Sqoop: + +- Software development in Java +* Familiarity with JDBC +* Familiarity with Hadoop's APIs (including the "new" MapReduce API of + 0.20+) +- Relational database management systems and SQL + +This document assumes you are using a Linux or Linux-like environment. +If you are using Windows, you may be able to use cygwin to accomplish +most of the following tasks. If you are using Mac OS X, you should see +few (if any) compatibility errors. Sqoop is predominantly operated and +tested on Linux. + + diff --git a/src/docs/direct.txt b/src/docs/direct.txt deleted file mode 100644 index b0eb22b7..00000000 --- a/src/docs/direct.txt +++ /dev/null @@ -1,77 +0,0 @@ - -//// - Licensed to Cloudera, Inc. under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -//// - - -Direct-mode Imports -------------------- - -While the JDBC-based import method used by Sqoop provides it with the -ability to read from a variety of databases using a generic driver, it -is not the most high-performance method available. Sqoop can read from -certain database systems faster by using their built-in export tools. - -For example, Sqoop can read from a MySQL database by using the +mysqldump+ -tool distributed with MySQL. You can take advantage of this faster -import method by running Sqoop with the +--direct+ argument. This -combined with a connect string that begins with +jdbc:mysql://+ will -inform Sqoop that it should select the faster access method. - -If your delimiters exactly match the delimiters used by +mysqldump+, -then Sqoop will use a fast-path that copies the data directly from -+mysqldump+'s output into HDFS. Otherwise, Sqoop will parse +mysqldump+'s -output into fields and transcode them into the user-specified delimiter set. -This incurs additional processing, so performance may suffer. -For convenience, the +--mysql-delimiters+ -argument will set all the output delimiters to be consistent with -+mysqldump+'s format. - -Sqoop also provides a direct-mode backend for PostgreSQL that uses the -+COPY TO STDOUT+ protocol from +psql+. No specific delimiter set provides -better performance; Sqoop will forward delimiter control arguments to -+psql+. - -The "Supported Databases" section provides a full list of database vendors -which have direct-mode support from Sqoop. - -When writing to HDFS, direct mode will open a single output file to receive -the results of the import. You can instruct Sqoop to use multiple output -files by using the +--direct-split-size+ argument which takes a size in -bytes. Sqoop will generate files of approximately this size. 
e.g., -+--direct-split-size 1000000+ will generate files of approximately 1 MB -each. If compressing the HDFS files with +--compress+, this will allow -subsequent MapReduce programs to use multiple mappers across your data -in parallel. - -Tool-specific arguments -~~~~~~~~~~~~~~~~~~~~~~~ - -Sqoop will generate a set of command-line arguments with which it invokes -the underlying direct-mode tool (e.g., mysqldump). You can specify additional -arguments which should be passed to the tool by passing them to Sqoop -after a single '+-+' argument. e.g.: - ----- -$ sqoop --connect jdbc:mysql://localhost/db --table foo --direct - --lock-tables ----- - -The +--lock-tables+ argument (and anything else to the right of the +-+ argument) -will be passed directly to mysqldump. - - - - diff --git a/src/docs/export.txt b/src/docs/export.txt deleted file mode 100644 index 6a792562..00000000 --- a/src/docs/export.txt +++ /dev/null @@ -1,76 +0,0 @@ - -//// - Licensed to Cloudera, Inc. under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -//// - - -Exporting to a Database ------------------------ - -In addition to importing database tables into HDFS, Sqoop can also -work in "reverse," reading the contents of a file or directory in -HDFS, interpreting the data as database rows, and inserting them -into a specified database table. - -To run an export, invoke Sqoop with the +--export-dir+ and -+--table+ options. e.g.: - ----- -$ sqoop --connect jdbc:mysql://db.example.com/foo --table bar \ - --export-dir /results/bar_data ----- - -This will take the files in +/results/bar_data+ and inject their -contents in to the +bar+ table in the +foo+ database on +db.example.com+. -The target table must already exist in the database. Sqoop will perform -a set of +INSERT INTO+ operations, without regard for existing content. If -Sqoop attempts to insert rows which violate constraints in the database -(e.g., a particular primary key value already exists), then the export -will fail. - -As in import mode, Sqoop will auto-generate an interoperability class -to use with the particular table in question. This will be used to parse -the records in HDFS files before loading their contents into the database. -You must specify the same delimiters (e.g., with +--fields-terminated-by+, -etc.) as are used in the files to export in order to parse the data -correctly. If your data is stored in SequenceFiles (created with an import -in the +--as-sequencefile+ format), then you do not need to specify -delimiters. - -If you have an existing auto-generated jar and class that you intend to use -with Sqoop, you can specify these with the +--jar-file+ and +--class-name+ -parameters. Providing these options will disable autogeneration of a new -class based on the target table. - - -Exporting to MySQL -~~~~~~~~~~~~~~~~~~ - -MySQL supports a direct mode for exports. 
If the +--direct+ argument is given -when exporting to a MySQL database, Sqoop will use instances of +mysqlimport+ -to manage the export process. - -For performance, each writer will commit approximately every 32 MB of exported -data. This can be controlled by passing the following argument _before_ any -named parameters: +-D sqoop.mysql.export.checkpoint.bytes=_size_+, where _size_ -is a value in bytes. Setting _size_ to 0 will disable intermediate checkpoints, -although individual files being exported will continue to be committed -independently of one another. - -IMPORTANT: Note that any arguments to Sqoop that are of the form -+-D parameter=value+ must appear before any named arguments (e.g., +--connect+, -+--table+, etc). - diff --git a/src/docs/full-db-import.txt b/src/docs/full-db-import.txt deleted file mode 100644 index 4a9477c3..00000000 --- a/src/docs/full-db-import.txt +++ /dev/null @@ -1,92 +0,0 @@ - -//// - Licensed to Cloudera, Inc. under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -//// - - -Automatic Full-database Import -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -If you want to import all the tables in a database, you can use the -+--all-tables+ command to do so: - ----- -$ sqoop --connect jdbc:mysql://database.example.com/employees --all-tables ----- - -This will query the database for the available tables, generate an ORM -class for each table, and run a MapReduce job to import each one. -Hadoop uses the DBInputFormat to read from a database into a Mapper -instance. To read a table into a MapReduce program requires creating a -class to hold the fields of one row of the table. One of the benefits -of Sqoop is that it generates this class definition for you, based on -the table definition in the database. - -The generated +.java+ files are, by default, placed in the current -directory. You can supply a different directory with the +--outdir+ -parameter. These are then compiled into +.class+ and +.jar+ files for use -by the MapReduce job that it launches. These files are created in a -temporary directory. You can redirect this target with +--bindir+. - -Each table will be imported into a separate directory in HDFS, with -the same name as the table. For instance, if my Hadoop username is -aaron, the above command would have generated the following -directories in HDFS: - ----- -/user/aaron/employee_names -/user/aaron/payroll_checks -/user/aaron/job_descriptions -/user/aaron/office_supplies ----- - -You can change the base directory under which the tables are loaded -with the +--warehouse-dir+ parameter. 
For example: - ----- -$ sqoop --connect jdbc:mysql://database.example.com/employees --all-tables \ - --warehouse-dir /common/warehouse ----- - -This would create the following directories instead: - ----- -/common/warehouse/employee_names -/common/warehouse/payroll_checks -/common/warehouse/job_descriptions -/common/warehouse/office_supplies ----- - -By default the data will be read into text files in HDFS. Each of the -columns will be represented as comma-delimited text. Each row is -terminated by a newline. See the section on "Controlling the Output -Format" below for information on how to change these delimiters. - -If you want to leverage compression and binary file formats, the -+--as-sequencefile+ argument to Sqoop will import the table -to a set of SequenceFiles instead. This stores each field of each -database record in a separate object in a SequenceFile. -This representation is also likely to be higher performance when used -as an input to subsequent MapReduce programs as it does not require -parsing. For completeness, Sqoop provides an +--as-textfile+ option, which is -implied by default. An +--as-textfile+ on the command-line will override -a previous +--as-sequencefile+ argument. - -The SequenceFile format will embed the records from the database as -objects using the code generated by Sqoop. It is important that you -retain the +.java+ file for this class, as you will need to be able to -instantiate the same type to read the objects back later, in other -user-defined applications. - diff --git a/src/docs/hive.txt b/src/docs/hive.txt deleted file mode 100644 index cd9f236e..00000000 --- a/src/docs/hive.txt +++ /dev/null @@ -1,72 +0,0 @@ - -//// - Licensed to Cloudera, Inc. under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -//// - - -Importing Data Into Hive ------------------------- - -Sqoop's primary function is to upload your data into files in HDFS. If -you have a Hive metastore associated with your HDFS cluster, Sqoop can -also import the data into Hive by generating and executing a +CREATE -TABLE+ statement to define the data's layout in Hive. Importing data -into Hive is as simple as adding the *+--hive-import+* option to your -Sqoop command line. - -By default the data is imported into HDFS, but you can skip this operation -by using the *+--hive-create+* option. Optionally, you can specify the -*+--hive-overwrite+* option to indicate that existing table in hive must -be replaced. After your data is imported into HDFS or this step is -omitted, Sqoop will generate a Hive script containing a +CREATE TABLE+ -operation defining your columns using Hive's types, and a +LOAD DATA INPATH+ -statement to move the data files into Hive's warehouse directory if -*+--hive-create+* option is not added. The script will be executed by calling -the installed copy of hive on the machine where Sqoop is run. 
If you have -multiple Hive installations, or +hive+ is not in your +$PATH+ use the -*+--hive-home+* option to identify the Hive installation directory. -Sqoop will use +$HIVE_HOME/bin/hive+ from here. - -NOTE: This function is incompatible with +--as-sequencefile+. - -Hive's text parser does not know how to support escaping or enclosing -characters. Sqoop will print a warning if you use +--escaped-by+, -+--enclosed-by+, or +--optionally-enclosed-by+ since Hive does not know -how to parse these. It will pass the field and record terminators through -to Hive. If you do not set any delimiters and do use +--hive-import+, -the field delimiter will be set to +^A+ and the record delimiter will -be set to +\n+ to be consistent with Hive's defaults. - -The table name used in Hive is, by default, the same as that of the -source table. You can control the output table name with the +--hive-table+ -option. - -If Hive import commands are used in conjunction with the +--generate-only+ -option, then a Hive import will not occur. Instead, the DDL commands to -perform the import from HDFS to Hive are written to a file named +_tableName_.q+ -which you can then execute with +hive -f+ after the data is brought into -HDFS. - -Hive's Type System -~~~~~~~~~~~~~~~~~~ - -Hive users will note that there is not a one-to-one mapping between -SQL types and Hive types. In general, SQL types that do not have a -direct mapping (e.g., +DATE+, +TIME+, and +TIMESTAMP+) will be coerced to -+STRING+ in Hive. The +NUMERIC+ and +DECIMAL+ SQL types will be coerced to -+DOUBLE+. In these cases, Sqoop will emit a warning in its log messages -informing you of the loss of precision. - diff --git a/src/docs/misc-args.txt b/src/docs/misc-args.txt deleted file mode 100644 index d8f86840..00000000 --- a/src/docs/misc-args.txt +++ /dev/null @@ -1,48 +0,0 @@ - -//// - Licensed to Cloudera, Inc. under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -//// - - -Miscellaneous Additional Arguments -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If you want to generate the Java classes to represent tables without -actually performing an import, supply a connect string and -(optionally) credentials as above, as well as +--all-tables+ or -+--table+, but also use the +--generate-only+ argument. This will -generate the classes and cease further operation. - -You can override the +$HADOOP_HOME+ environment variable within Sqoop -with the +--hadoop-home+ argument. You can override the +$HIVE_HOME+ -environment variable with +--hive-home+. - -Data emitted to HDFS is by default uncompressed. You can instruct -Sqoop to use gzip to compress your data by providing either the -+--compress+ or +-z+ argument (both are equivalent). - -Small CLOB and BLOB values will be imported as string-based data inline -with the rest of their containing record. 
Over a size threshold (by -default, 16 MB per object), these values will not be materialized directly, -inline, and will be written to external files in HDFS; the inline records -will contain pointers to these files. The inline materialization limit can -be controlled with the +--inline-lob-limit+ argument; the limit itself is -specified in bytes. - -Using +--verbose+ will instruct Sqoop to print more details about its -operation; this is particularly handy if Sqoop appears to be misbehaving. - - diff --git a/src/docs/supported-dbs.txt b/src/docs/supported-dbs.txt deleted file mode 100644 index 6fbe8084..00000000 --- a/src/docs/supported-dbs.txt +++ /dev/null @@ -1,55 +0,0 @@ - -//// - Licensed to Cloudera, Inc. under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -//// - - -Supported Databases -------------------- - -Sqoop uses JDBC to connect to databases. JDBC is a compatibility layer -that allows a program to access many different databases through a common -API. Slight differences in the SQL language spoken by each database, however, -may mean that Sqoop can't use every database out of the box, or that some -databases may be used in an inefficient manner. - -When you provide a connect string to Sqoop, it inspects the protocol scheme to -determine appropriate vendor-specific logic to use. If Sqoop knows about -a given database, it will work automatically. If not, you may need to -specify the driver class to load via +--driver+. This will use a generic -code path which will use standard SQL to access the database. Sqoop provides -some databases with faster, non-JDBC-based access mechanisms. These can be -enabled by specfying the +--direct+ parameter. - -Sqoop includes vendor-specific code paths for the following databases: - -[grid="all"] -`-----------`--------`--------------------`--------------------- -Database version +--direct+ support? connect string matches ----------------------------------------------------------------- -HSQLDB 1.8.0+ No +jdbc:hsqldb:*//+ -MySQL 5.0+ Yes +jdbc:mysql://+ -Oracle 10.2.0+ No +jdbc:oracle:*//+ -PostgreSQL 8.3+ Yes (import only) +jdbc:postgresql://+ ----------------------------------------------------------------- - -Sqoop may work with older versions of the databases listed, but we have -only tested it with the versions specified above. - -Even if Sqoop supports a database internally, you may still need to -install the database vendor's JDBC driver in your +$HADOOP_HOME/lib+ -path. - diff --git a/src/docs/table-import.txt b/src/docs/table-import.txt deleted file mode 100644 index 8a77d4f2..00000000 --- a/src/docs/table-import.txt +++ /dev/null @@ -1,68 +0,0 @@ - -//// - Licensed to Cloudera, Inc. under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - Cloudera, Inc. 
licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -//// - - -Importing Individual Tables -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In addition to full-database imports, Sqoop will allow you to import -individual tables. Instead of using +--all-tables+, specify the name of -a particular table with the +--table+ argument: - ----- -$ sqoop --connect jdbc:mysql://database.example.com/employees \ - --table employee_names ----- - -You can further specify a subset of the columns in a table by using -the +--columns+ argument. This takes a list of column names, delimited -by commas, with no spaces in between. e.g.: - ----- -$ sqoop --connect jdbc:mysql://database.example.com/employees \ - --table employee_names --columns employee_id,first_name,last_name,dept_id ----- - -Sqoop will use a MapReduce job to read sections of the table in -parallel. For the MapReduce tasks to divide the table space, the -results returned by the database must be orderable. Sqoop will -automatically detect the primary key for a table and use that to order -the results. If no primary key is available, or (less likely) you want -to order the results along a different column, you can specify the -column name with +--split-by+. - -.Row ordering -IMPORTANT: To guarantee correctness of your input, you must select an -ordering column for which each row has a unique value. If duplicate -values appear in the ordering column, the results of the import are -undefined, and Sqoop will not be able to detect the error. - -Finally, you can control which rows of a table are imported via the -+--where+ argument. With this argument, you may specify a clause to be -appended to the SQL statement used to select rows from the table, -e.g.: - ----- -$ sqoop --connect jdbc:mysql://database.example.com/employees \ - --table employee_names --where "employee_id > 40 AND active = 1" ----- - -The +--columns+, +--split-by+, and +--where+ arguments are incompatible with -+--all-tables+. If you require special handling for some of the tables, -then you must manually run a separate import job for each table. - diff --git a/src/docs/Sqoop-manpage.txt b/src/docs/user/Sqoop-manpage.txt similarity index 100% rename from src/docs/Sqoop-manpage.txt rename to src/docs/user/Sqoop-manpage.txt diff --git a/src/docs/user/SqoopUserGuide.txt b/src/docs/user/SqoopUserGuide.txt new file mode 100644 index 00000000..35ac4ec3 --- /dev/null +++ b/src/docs/user/SqoopUserGuide.txt @@ -0,0 +1,52 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + +include::intro.txt[] + +include::preface.txt[] + +include::basics.txt[] + +include::tools.txt[] + +include::import.txt[] + +include::import-all-tables.txt[] + +include::export.txt[] + +include::codegen.txt[] + +include::create-hive-table.txt[] + +include::eval.txt[] + +include::list-databases.txt[] + +include::list-tables.txt[] + +include::help.txt[] + +include::version.txt[] + +include::compatibility.txt[] + +include::support.txt[] + + diff --git a/src/docs/user/basics.txt b/src/docs/user/basics.txt new file mode 100644 index 00000000..aaef72f8 --- /dev/null +++ b/src/docs/user/basics.txt @@ -0,0 +1,63 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + +Basic Usage +----------- + +With Sqoop, you can _import_ data from a relational database system into +HDFS. The input to the import process is a database table. Sqoop +will read the table row-by-row into HDFS. The output of this import +process is a set of files containing a copy of the imported table. +The import process is performed in parallel. For this reason, the +output will be in multiple files. These files may be delimited text +files (for example, with commas or tabs separating each field), or +binary SequenceFiles containing serialized record data. + +A by-product of the import process is a generated Java class which +can encapsulate one row of the imported table. This class is used +during the import process by Sqoop itself. The Java source code for +this class is also provided to you, for use in subsequent MapReduce +processing of the data. This class can serialize and deserialize data +to and from the SequenceFile format. It can also parse the +delimited-text form of a record. These abilities allow you to quickly +develop MapReduce applications that use the HDFS-stored records in +your processing pipeline. You are also free to parse the delimited +record data yourself, using any other tools you prefer. + +After manipulating the imported records (for example, with MapReduce +or Hive) you may have a result data set which you can then _export_ +back to the relational database. Sqoop's export process will read +a set of delimited text files from HDFS in parallel, parse them into +records, and insert them as new rows in a target database table, for +consumption by external applications or users. + +Sqoop includes some other commands which allow you to inspect the +database you are working with.
For example, you can list the available +database schemas (with the +sqoop-list-databases+ tool) and tables +within a schema (with the +sqoop-list-tables+ tool). Sqoop also +includes a primitive SQL execution shell (the +sqoop-eval+ tool). + +Most aspects of the import, code generation, and export processes can +be customized. You can control the specific row range or columns imported. +You can specify particular delimiters and escape characters for the +file-based representation of the data, as well as the file format +used. You can also control the class or package names used in +generated code. Subsequent sections of this document explain how to +specify these and other arguments to Sqoop. + + diff --git a/src/docs/user/codegen-args.txt b/src/docs/user/codegen-args.txt new file mode 100644 index 00000000..178255af --- /dev/null +++ b/src/docs/user/codegen-args.txt @@ -0,0 +1,33 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + +.Code generation arguments: +[grid="all"] +`------------------------`----------------------------------------------- +Argument Description +------------------------------------------------------------------------- ++\--bindir + Output directory for compiled objects ++\--class-name + Sets the generated class name. This overrides\ + +\--package-name+. When combined with \ + +\--jar-file+, sets the input class. ++\--jar-file + Disable code generation; use specified jar ++\--outdir + Output directory for generated code ++\--package-name + Put auto-generated classes in this package +------------------------------------------------------------------------- + diff --git a/src/docs/user/codegen.txt b/src/docs/user/codegen.txt new file mode 100644 index 00000000..397c1170 --- /dev/null +++ b/src/docs/user/codegen.txt @@ -0,0 +1,83 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + ++sqoop-codegen+ +--------------- + + +Purpose +~~~~~~~ + +The +codegen+ tool generates Java classes which encapsulate and +interpret imported records. 
The Java definition of a record is +instantiated as part of the import process, but can also be performed +separately. For example, if Java source is lost, it can be recreated. +New versions of a class can be created which use different delimiters +between fields, and so on. + +Syntax +~~~~~~ + +---- +$ sqoop codegen (generic-args) (codegen-args) +$ sqoop-codegen (generic-args) (codegen-args) +---- + +Although the Hadoop generic arguments must preceed any codegen arguments, +the codegen arguments can be entered in any order with respect to one +another. + + +include::common-args.txt[] + +.Code generation arguments: +[grid="all"] +`------------------------`----------------------------------------------- +Argument Description +------------------------------------------------------------------------- ++\--bindir + Output directory for compiled objects ++\--class-name + Sets the generated class name. This overrides\ + +\--package-name+. ++\--outdir + Output directory for generated code ++\--package-name + Put auto-generated classes in this package ++\--table + Name of the table to generate code for. +------------------------------------------------------------------------- + +include::output-args.txt[] + +include::input-args.txt[] + +include::hive-args.txt[] + +If Hive arguments are provided to the code generation tool, Sqoop +generates a file containing the HQL statements to create a table and +load data. + +Example Invocations +~~~~~~~~~~~~~~~~~~~ + +Recreate the record interpretation code for the +employees+ table of a +corporate database: + +---- +$ sqoop codegen --connect jdbc:mysql://db.example.com/corp \ + --table employees +---- + + diff --git a/src/docs/user/common-args.txt b/src/docs/user/common-args.txt new file mode 100644 index 00000000..390f2c77 --- /dev/null +++ b/src/docs/user/common-args.txt @@ -0,0 +1,33 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + +.Common arguments +[grid="all"] +`-------------------------`------------------------------------------ +Argument Description +--------------------------------------------------------------------- ++\--connect + Specify JDBC connect string ++\--driver + Manually specify JDBC driver class to use ++\--hadoop-home + Override $HADOOP_HOME ++\--help+ Print usage instructions ++-P+ Read password from console ++\--password + Set authentication password ++\--username + Set authentication username ++\--verbose+ Print more information while working +--------------------------------------------------------------------- + diff --git a/src/docs/user/compatibility.txt b/src/docs/user/compatibility.txt new file mode 100644 index 00000000..0b189abf --- /dev/null +++ b/src/docs/user/compatibility.txt @@ -0,0 +1,184 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + +Compatibility Notes +------------------- + +Sqoop uses JDBC to connect to databases and adheres to +published standards as much as possible. For databases which do not +support standards-compliant SQL, Sqoop uses alternate codepaths to +provide functionality. In general, Sqoop is believed to be compatible +with a large number of databases, but it is tested with only a few. + +Nonetheless, several database-specific decisions were made in the +implementation of Sqoop, and some databases offer additional settings +which are extensions to the standard. + +This section describes the databases tested with Sqoop, any +exceptions in Sqoop's handling of each database relative to the +norm, and any database-specific settings available in Sqoop. + +Supported Databases +~~~~~~~~~~~~~~~~~~~ + +While JDBC is a compatibility layer that allows a program to access +many different databases through a common API, slight differences in +the SQL language spoken by each database may mean that Sqoop can't use +every database out of the box, or that some databases may be used in +an inefficient manner. + +When you provide a connect string to Sqoop, it inspects the protocol scheme to +determine appropriate vendor-specific logic to use. If Sqoop knows about +a given database, it will work automatically. If not, you may need to +specify the driver class to load via +\--driver+. This will use a generic +code path which will use standard SQL to access the database. Sqoop provides +some databases with faster, non-JDBC-based access mechanisms. These can be +enabled by specfying the +\--direct+ parameter. + +Sqoop includes vendor-specific support for the following databases: + +[grid="all"] +`-----------`--------`--------------------`--------------------- +Database version +\--direct+ support? connect string matches +---------------------------------------------------------------- +HSQLDB 1.8.0+ No +jdbc:hsqldb:*//+ +MySQL 5.0+ Yes +jdbc:mysql://+ +Oracle 10.2.0+ No +jdbc:oracle:*//+ +PostgreSQL 8.3+ Yes (import only) +jdbc:postgresql://+ +---------------------------------------------------------------- + +Sqoop may work with older versions of the databases listed, but we have +only tested it with the versions specified above. + +Even if Sqoop supports a database internally, you may still need to +install the database vendor's JDBC driver in your +$HADOOP_HOME/lib+ +path. + +MySQL +~~~~~ + +JDBC Driver: http://www.mysql.com/downloads/connector/j/[MySQL +Connector/J] + +MySQL v5.0 and above offers very thorough coverage by Sqoop. In builds +of Sqoop included with Cloudera's Distribution for Hadoop, the +Connector/J JDBC driver is included with the installation. + +zeroDateTimeBehavior +^^^^^^^^^^^^^^^^^^^^ + +MySQL allows values of +'0000-00-00'+ for +DATE+ columns, which is a +non-standard extension to SQL. 
When communicated via JDBC, these +values are handled in one of three different ways: + +- Convert to +NULL+. +- Throw an exception in the client. +- Round to the nearest legal date (+'0001-01-01'+). + +You specify the behavior by using the +zeroDateTimeBehavior+ +property of the connect string. If a +zeroDateTimeBehavior+ property +is not specified, Sqoop uses the +convertToNull+ behavior. + +You can override this behavior. For example: + +---- +$ sqoop import --table foo \ + --connect jdbc:mysql://db.example.com/someDb?zeroDateTimeBehavior=round +---- + ++UNSIGNED+ columns +^^^^^^^^^^^^^^^^^^ + +Columns with type +UNSIGNED+ in MySQL can hold values between 0 and +2^32 (+4294967295+), but the database will report the data type to Sqoop +as +INTEGER+, which will can hold values between +-2147483648+ and ++\+2147483647+. Sqoop cannot currently import +UNSIGNED+ values above ++2147483647+. + ++BLOB+ and +CLOB+ columns +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Sqoop's direct mode does not support imports of +BLOB+, +CLOB+, or ++LONGVARBINARY+ columns. Use JDBC-based imports for these +columns; do not supply the +\--direct+ argument to the import tool. + + +Direct-mode Transactions +^^^^^^^^^^^^^^^^^^^^^^^^ + +For performance, each writer will commit the current transaction +approximately every 32 MB of exported data. You can control this +by specifying the following argument _before_ any tool-specific arguments: +-D +sqoop.mysql.export.checkpoint.bytes=size+, where _size_ is a value in +bytes. Set _size_ to 0 to disable intermediate checkpoints, +but individual files being exported will continue to be committed +independently of one another. + +IMPORTANT: Note that any arguments to Sqoop that are of the form +-D +parameter=value+ are Hadoop _generic arguments_ and must appear before +any tool-specific arguments (for example, +\--connect+, +\--table+, etc). + + +Oracle +~~~~~~ + +JDBC Driver: +http://www.oracle.com/technology/software/tech/java/sqlj_jdbc/htdocs/jdbc_112010.html[Oracle +JDBC Thin Driver] - Sqoop is compatible with +ojdbc6.jar+. + +Sqoop has been tested with Oracle 10.2.0 Express Edition. Oracle is +notable in its different approach to SQL from the ANSI standard, and +its non-standard JDBC driver. Therefore, several features work +differently. + +Dates and Times +^^^^^^^^^^^^^^^ + +Oracle JDBC represents +DATE+ and +TIME+ SQL types as +TIMESTAMP+ +values. Any +DATE+ columns in an Oracle database will be imported as a ++TIMESTAMP+ in Sqoop, and Sqoop-generated code will store these values +in +java.sql.Timestamp+ fields. + +When exporting data back to a database, Sqoop parses text fields as ++TIMESTAMP+ types (with the form +yyyy-mm-dd HH:MM:SS.ffffffff+) even +if you expect these fields to be formatted with the JDBC date escape +format of +yyyy-mm-dd+. Dates exported to Oracle should be formatted +as full timestamps. + +Oracle also includes the additional date/time types +TIMESTAMP WITH +TIMEZONE+ and +TIMESTAMP WITH LOCAL TIMEZONE+. To support these types, +the user's session timezone must be specified. By default, Sqoop will +specify the timezone +"GMT"+ to Oracle. You can override this setting +by specifying a Hadoop property +oracle.sessionTimeZone+ on the +command-line when running a Sqoop job. 
For example: + +---- +$ sqoop import -D oracle.sessionTimeZone=America/Los_Angeles \ + --connect jdbc:oracle:thin:@//db.example.com/foo --table bar +---- + +Note that Hadoop parameters (+-D ...+) are _generic arguments_ and +must appear before the tool-specific arguments (+\--connect+, ++\--table+, and so on). + +Legal values for the session timezone string are enumerated at +http://download-west.oracle.com/docs/cd/B19306_01/server.102/b14225/applocaledata.htm#i637736[]. + + +include::hive-notes.txt[] + diff --git a/src/docs/connecting.txt b/src/docs/user/connecting.txt similarity index 53% rename from src/docs/connecting.txt rename to src/docs/user/connecting.txt index b12fc99c..1b3f95f5 100644 --- a/src/docs/connecting.txt +++ b/src/docs/user/connecting.txt @@ -18,68 +18,70 @@ Connecting to a Database Server -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Sqoop is designed to import tables from a database into HDFS. As such, -it requires a _connect string_ that describes how to connect to the -database. The _connect string_ looks like a URL, and is communicated to -Sqoop with the +--connect+ argument. This describes the server and -database to connect to; it may also specify the port. e.g.: +Sqoop is designed to import tables from a database into HDFS. To do +so, you must specify a _connect string_ that describes how to connect to the +database. The _connect string_ is similar to a URL, and is communicated to +Sqoop with the +\--connect+ argument. This describes the server and +database to connect to; it may also specify the port. For example: ---- -$ sqoop --connect jdbc:mysql://database.example.com/employees +$ sqoop import --connect jdbc:mysql://database.example.com/employees ---- This string will connect to a MySQL database named +employees+ on the host +database.example.com+. It's important that you *do not* use the URL +localhost+ if you intend to use Sqoop with a distributed Hadoop cluster. The connect string you supply will be used on TaskTracker nodes -throughout your MapReduce cluster; if they're told to connect to the -literal name +localhost+, they'll each reach a different -database (or more likely, no database at all)! Instead, you should use +throughout your MapReduce cluster; if you specify the +literal name +localhost+, each node will connect to a different +database (or more likely, no database at all). Instead, you should use the full hostname or IP address of the database host that can be seen by all your remote nodes. -You may need to authenticate against the database before you can -access it. The +--username+ and +--password+ or +-P+ parameters can -be used to supply a username and a password to the database. e.g.: +You might need to authenticate against the database before you can +access it. You can use the +\--username+ and +\--password+ or +-P+ parameters +to supply a username and a password to the database. For example: ---- -$ sqoop --connect jdbc:mysql://database.example.com/employees \ +$ sqoop import --connect jdbc:mysql://database.example.com/employees \ --username aaron --password 12345 ---- .Password security -WARNING: The +--password+ parameter is insecure, as other users may +WARNING: The +\--password+ parameter is insecure, as other users may be able to read your password from the command-line arguments via the output of programs such as `ps`. The *+-P+* argument will read a password from a console prompt, and is the preferred method of entering credentials. 
Credentials may still be transferred between nodes of the MapReduce cluster using insecure means. -Sqoop automatically supports several databases, including MySQL. Connect strings beginning -with +jdbc:mysql://+ are handled automatically Sqoop, though you may need -to install the driver yourself. (A full list of databases with -built-in support is provided in the "Supported Databases" section, below.) +Sqoop automatically supports several databases, including MySQL. +Connect strings beginning with +jdbc:mysql://+ are handled +automatically in Sqoop, though you may need to install the driver +yourself. (A full list of databases with built-in support is provided +in the "Supported Databases" section.) You can use Sqoop with any other -JDBC-compliant database as well. First, download the appropriate JDBC -driver for the database you want to import from, and install the .jar +JDBC-compliant database. First, download the appropriate JDBC +driver for the type of database you want to import, and install the .jar file in the +/usr/hadoop/lib+ directory on all machines in your Hadoop cluster, or some other directory which is in the classpath -on all nodes. Each driver jar also has a specific driver class which defines +on all nodes. Each driver +.jar+ file also has a specific driver class which defines the entry-point to the driver. For example, MySQL's Connector/J library has a driver class of +com.mysql.jdbc.Driver+. Refer to your database vendor-specific documentation to determine the main driver class. -This class must be provided as an argument to Sqoop with +--driver+. +This class must be provided as an argument to Sqoop with +\--driver+. -For example, to connect to a postgres database, first download the driver from -link:http://jdbc.postgresql.org[http://jdbc.postgresql.org] and -install it in your Hadoop lib path. -Then run Sqoop with something like: +For example, to connect to a SQLServer database, first download the driver from +microsoft.com and install it in your Hadoop lib path. + +Then run Sqoop. For example: ---- -$ sqoop --connect jdbc:postgresql://postgres-server.example.com/employees \ - --driver org.postgresql.Driver +$ sqoop import --driver com.microsoft.jdbc.sqlserver.SQLServerDriver \ + --connect ... ---- + diff --git a/src/docs/controlling-input-format.txt b/src/docs/user/controlling-input-format.txt similarity index 84% rename from src/docs/controlling-input-format.txt rename to src/docs/user/controlling-input-format.txt index 348f9daf..b0ff7677 100644 --- a/src/docs/controlling-input-format.txt +++ b/src/docs/user/controlling-input-format.txt @@ -29,14 +29,14 @@ include::input-formatting-args.txt[] If you have already imported data into HDFS in a text-based representation and want to change the delimiters being used, you -should regenerate the class via `sqoop --generate-only`, specifying -the new delimiters with +--fields-terminated-by+, etc., and the old -delimiters with +--input-fields-terminated-by+, etc. Then run a +should regenerate the class via `sqoop \--generate-only`, specifying +the new delimiters with +\--fields-terminated-by+, etc., and the old +delimiters with +\--input-fields-terminated-by+, etc. Then run a MapReduce job where your mapper creates an instance of your record class, uses its +parse()+ method to read the fields using the old delimiters, and emits a new +Text+ output value via the record's +toString()+ method, which will use the new delimiters. 
You'll then want to regenerate the class another time without the -+--input-fields-terminated-by+ specified so that the new delimiters ++\--input-fields-terminated-by+ specified so that the new delimiters are used for both input and output. diff --git a/src/docs/controlling-output-format.txt b/src/docs/user/controlling-output-format.txt similarity index 100% rename from src/docs/controlling-output-format.txt rename to src/docs/user/controlling-output-format.txt diff --git a/src/docs/user/create-hive-table.txt b/src/docs/user/create-hive-table.txt new file mode 100644 index 00000000..c120bb06 --- /dev/null +++ b/src/docs/user/create-hive-table.txt @@ -0,0 +1,82 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + ++sqoop-create-hive-table+ +------------------------- + + +Purpose +~~~~~~~ + +The +create-hive-table+ tool populates a Hive metastore with a +definition for a table based on a database table previously imported +to HDFS, or one planned to be imported. This effectively performs the +"+\--hive-import+" step of +sqoop-import+ without running the +preceeding import. + +If data was already loaded to HDFS, you can use this tool to finish +the pipeline of importing the data to Hive. You can also create Hive tables +with this tool; data then can be imported and populated into +the target after a preprocessing step run by the user. + +Syntax +~~~~~~ + +---- +$ sqoop create-hive-table (generic-args) (create-hive-table-args) +$ sqoop-create-hive-table (generic-args) (create-hive-table-args) +---- + +Although the Hadoop generic arguments must preceed any create-hive-table +arguments, the create-hive-table arguments can be entered in any order +with respect to one another. + + +include::common-args.txt[] + +.Hive arguments: +[grid="all"] +`-----------------------------`------------------------------------------- +Argument Description +-------------------------------------------------------------------------- ++\--hive-home + Override +$HIVE_HOME+ ++\--hive-overwrite+ Overwrite existing data in the Hive table. ++\--hive-table + Sets the table name to use when importing \ + to Hive. ++\--table+ The database table to read the \ + definition from. +-------------------------------------------------------------------------- + +include::output-args.txt[] + +Do not use enclosed-by or escaped-by delimiters with output formatting +arguments used to import to Hive. Hive cannot currently parse them. 
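+For example, the following sketch (the connect string, table names, and
+delimiter choice are illustrative) defines a Hive table using only a plain
+field terminator, which Hive can parse:
+
+----
+$ sqoop create-hive-table --connect jdbc:mysql://db.example.com/corp \
+    --table employees --hive-table emps \
+    --fields-terminated-by '\t'
+----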
+ +Example Invocations +~~~~~~~~~~~~~~~~~~~ + +Define in Hive a table named +emps+ with a definition based on a +database table named +employees+: + +---- +$ sqoop create-hive-table --connect jdbc:mysql://db.example.com/corp \ + --table employees --hive-table emps +---- + + diff --git a/src/docs/user/eval.txt b/src/docs/user/eval.txt new file mode 100644 index 00000000..ea078618 --- /dev/null +++ b/src/docs/user/eval.txt @@ -0,0 +1,65 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + ++sqoop-eval+ +------------ + + +Purpose +~~~~~~~ + +The +eval+ tool allows users to quickly run simple SQL queries against +a database; results are printed to the console. This allows users to +preview their import queries to ensure they import the data they +expect. + +Syntax +~~~~~~ + +---- +$ sqoop eval (generic-args) (eval-args) +$ sqoop-eval (generic-args) (eval-args) +---- + +Although the Hadoop generic arguments must preceed any eval arguments, +the eval arguments can be entered in any order with respect to one +another. + + +include::common-args.txt[] + +.SQL evaluation arguments: +[grid="all"] +`-----------------------------`------------------------------------------- +Argument Description +-------------------------------------------------------------------------- ++-e,\--query + Execute '+statement+' in SQL. +-------------------------------------------------------------------------- + +Example Invocations +~~~~~~~~~~~~~~~~~~~ + +Select ten records from the +employees+ table: + +---- +$ sqoop eval --connect jdbc:mysql://db.example.com/corp \ + --query "SELECT * FROM employees LIMIT 10" +---- + + diff --git a/src/docs/user/export.txt b/src/docs/user/export.txt new file mode 100644 index 00000000..ec2300dd --- /dev/null +++ b/src/docs/user/export.txt @@ -0,0 +1,153 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + ++sqoop-export+ +-------------- + + +Purpose +~~~~~~~ + +The +export+ tool exports a set of files from HDFS back to an RDBMS. +The target table must already exist in the database. 
The input files +are read and parsed into a set of records according to the +user-specified delimiters. These are then transformed into a set of ++INSERT+ statements that inject the records into the database. + +Syntax +~~~~~~ + +---- +$ sqoop export (generic-args) (import-args) +$ sqoop-export (generic-args) (import-args) +---- + +Although the Hadoop generic arguments must preceed any export arguments, +the export arguments can be entered in any order with respect to one +another. + + +include::common-args.txt[] + +.Export control arguments: +[grid="all"] +`-------------------------`------------------------------------------ +Argument Description +--------------------------------------------------------------------- ++\--direct+ Use direct export fast path ++\--export-dir + HDFS source path for the export ++-m,\--num-mappers + Use 'n' map tasks to export in parallel ++\--table + Table to populate +--------------------------------------------------------------------- + +The +\--table+ and +\--export-dir+ arguments are required. These +specify the table to populate in the database, and the +directory in HDFS that contains the source data. + +You can control the number of mappers independently from the number of +files present in the directory. Export performance depends on the +degree of parallelism. By default, Sqoop will use four tasks in +parallel for the export process. This may not be optimal; you will +need to experiment with your own particular setup. Additional tasks +may offer better concurrency, but if the database is already +bottlenecked on updating indices, invoking triggers, and so on, then +additional load may decrease performance. The +\--num-mappers+ or +-m+ +arguments control the number of map tasks, which is the degree of +parallelism used. + +MySQL provides a direct mode for exports as well, using the ++mysqlimport+ tool. When exporting to MySQL, use the +\--direct+ argument +to specify this codepath. This may be +higher-performance than the standard JDBC codepath. + +include::input-args.txt[] + +include::output-args.txt[] + +Sqoop automatically generates code to parse and interpret records of the +files containing the data to be exported back to the database. If +these files were created with non-default delimiters (comma-separated +fields with newline-separated records), you should specify +the same delimiters again so that Sqoop can parse your files. + +If you specify incorrect delimiters, Sqoop will fail to find enough +columns per line. This will cause export map tasks to fail by throwing ++ParseExceptions+. + +include::codegen-args.txt[] + +If the records to be exported were generated as the result of a +previous import, then the original generated class can be used to read +the data back. Specifying +\--jar-file+ and +\--class-name+ obviate +the need to specify delimiters in this case. + +Exports and Transactions +~~~~~~~~~~~~~~~~~~~~~~~~ + +Exports are performed by multiple writers in parallel. Each writer +uses a separate connection to the database; these have separate +transactions from one another. Sqoop uses the multi-row +INSERT+ +syntax to insert up to 100 records per statement. Every 100 +statements, the current transaction within a writer task is committed, +causing a commit every 10,000 rows. This ensures that transaction +buffers do not grow without bound, and cause out-of-memory conditions. +Therefore, an export is not an atomic process. Partial results from +the export will become visible before the export is complete. 
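+If you need to observe how far a long-running export has progressed, one
+option is to count the rows committed so far with the +eval+ tool; the
+connect string and table name in this sketch are illustrative:
+
+----
+$ sqoop eval --connect jdbc:mysql://db.example.com/foo \
+    --query "SELECT COUNT(*) FROM bar"
+----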
+ +Failed Exports +~~~~~~~~~~~~~~ + +Exports may fail for a number of reasons: + +- Loss of connectivity from the Hadoop cluster to the database (either + due to hardware fault, or server software crashes) +- Attempting to +INSERT+ a row which violates a consistency constraint + (for example, inserting a duplicate primary key value) +- Attempting to parse an incomplete or malformed record from the HDFS + source data +- Attempting to parse records using incorrect delimiters +- Capacity issues (such as insufficient RAM or disk space) + +If an export map task fails due to these or other reasons, it will +cause the export job to fail. The results of a failed export are +undefined. Each export map task operates in a separate transaction. +Furthermore, individual map tasks +commit+ their current transaction +periodically. If a task fails, the current transaction will be rolled +back. Any previously-committed transactions will remain durable in the +database, leading to a partially-complete export. + +Example Invocations +~~~~~~~~~~~~~~~~~~~ + +A basic export to populate a table named +bar+: + +---- +$ sqoop export --connect jdbc:mysql://db.example.com/foo --table bar \ + --export-dir /results/bar_data +---- + +This example takes the files in +/results/bar_data+ and injects their +contents in to the +bar+ table in the +foo+ database on +db.example.com+. +The target table must already exist in the database. Sqoop performs +a set of +INSERT INTO+ operations, without regard for existing content. If +Sqoop attempts to insert rows which violate constraints in the database +(for example, a particular primary key value already exists), then the export +fails. + + diff --git a/src/docs/user/help.txt b/src/docs/user/help.txt new file mode 100644 index 00000000..3b8aceb5 --- /dev/null +++ b/src/docs/user/help.txt @@ -0,0 +1,82 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + ++sqoop-help+ +------------ + +Purpose +~~~~~~~ + +List tools available in Sqoop and explain their usage. + +Syntax +~~~~~~ + +---- +$ sqoop help [tool-name] +$ sqoop-help [tool-name] +---- + +If no tool name is provided (for example, the user runs +sqoop help+), then +the available tools are listed. With a tool name, the usage +instructions for that specific tool are presented on the console. + +Example Invocations +~~~~~~~~~~~~~~~~~~~ + +List available tools: + +---- +$ sqoop help +usage: sqoop COMMAND [ARGS] + +Available commands: + codegen Generate code to interact with database records + create-hive-table Import a table definition into Hive + eval Evaluate a SQL statement and display the results + export Export an HDFS directory to a database table + +... + +See 'sqoop help COMMAND' for information on a specific command. 
+---- + +Display usage instructions for the +import+ tool: + +---- +$ bin/sqoop help import +usage: sqoop import [GENERIC-ARGS] [TOOL-ARGS] + +Common arguments: + --connect Specify JDBC connect string + --driver Manually specify JDBC driver class to use + --hadoop-home Override $HADOOP_HOME + --help Print usage instructions +-P Read password from console + --password Set authentication password + --username Set authentication username + --verbose Print more information while working + +Import control arguments: + --as-sequencefile Imports data to SequenceFiles + --as-textfile Imports data as plain text (default) +... +---- + + diff --git a/src/docs/user/hive-args.txt b/src/docs/user/hive-args.txt new file mode 100644 index 00000000..83c7ae2d --- /dev/null +++ b/src/docs/user/hive-args.txt @@ -0,0 +1,32 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + +.Hive arguments: +[grid="all"] +`-----------------------------`------------------------------------------- +Argument Description +-------------------------------------------------------------------------- ++\--hive-home + Override +$HIVE_HOME+ ++\--hive-import+ Import tables into Hive (Uses Hive's \ + default delimiters if none are set.) ++\--hive-overwrite+ Overwrite existing data in the Hive table. ++\--hive-table + Sets the table name to use when importing\ + to Hive. +-------------------------------------------------------------------------- + diff --git a/src/docs/user/hive-notes.txt b/src/docs/user/hive-notes.txt new file mode 100644 index 00000000..11fb36d8 --- /dev/null +++ b/src/docs/user/hive-notes.txt @@ -0,0 +1,30 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + + +Schema Definition in Hive +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Hive users will note that there is not a one-to-one mapping between +SQL types and Hive types. In general, SQL types that do not have a +direct mapping (for example, +DATE+, +TIME+, and +TIMESTAMP+) will be coerced to ++STRING+ in Hive. The +NUMERIC+ and +DECIMAL+ SQL types will be coerced to ++DOUBLE+. 
In these cases, Sqoop will emit a warning in its log messages +informing you of the loss of precision. + diff --git a/src/docs/user/hive.txt b/src/docs/user/hive.txt new file mode 100644 index 00000000..13608dbc --- /dev/null +++ b/src/docs/user/hive.txt @@ -0,0 +1,59 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + +Importing Data Into Hive +^^^^^^^^^^^^^^^^^^^^^^^^ + +Sqoop's import tool's main function is to upload your data into files +in HDFS. If you have a Hive metastore associated with your HDFS +cluster, Sqoop can also import the data into Hive by generating and +executing a +CREATE TABLE+ statement to define the data's layout in +Hive. Importing data into Hive is as simple as adding the +*+\--hive-import+* option to your Sqoop command line. + +If the Hive table already exists, you can specify the +*+\--hive-overwrite+* option to indicate that existing table in hive must +be replaced. After your data is imported into HDFS or this step is +omitted, Sqoop will generate a Hive script containing a +CREATE TABLE+ +operation defining your columns using Hive's types, and a +LOAD DATA INPATH+ +statement to move the data files into Hive's warehouse directory. + +The script will be executed by calling +the installed copy of hive on the machine where Sqoop is run. If you have +multiple Hive installations, or +hive+ is not in your +$PATH+, use the +*+\--hive-home+* option to identify the Hive installation directory. +Sqoop will use +$HIVE_HOME/bin/hive+ from here. + +NOTE: This function is incompatible with +\--as-sequencefile+. + +Hive's text parser does not support escaping or enclosing +characters. Sqoop will print a warning if you use +\--escaped-by+, ++\--enclosed-by+, or +\--optionally-enclosed-by+ because Hive does not know +how to parse these. It will pass the field and record delimiters through +to Hive. If you do not set any delimiters and do use +\--hive-import+, +the field delimiter will be set to +^A+ and the record delimiter will +be set to +\n+ to be consistent with Hive's defaults. It is important when +importing data to Hive to choose unambiguous field and record delimiters +due to the lack of escape and enclosing characters. + +The table name used in Hive is, by default, the same as that of the +source table. You can control the output table name with the +\--hive-table+ +option. + + diff --git a/src/docs/user/import-all-tables.txt b/src/docs/user/import-all-tables.txt new file mode 100644 index 00000000..bc95c484 --- /dev/null +++ b/src/docs/user/import-all-tables.txt @@ -0,0 +1,112 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. 
licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + ++sqoop-import-all-tables+ +------------------------- + +Purpose +~~~~~~~ + +The +import-all-tables+ tool imports a set of tables from an RDBMS to HDFS. +Data from each table is stored in a separate directory in HDFS. + +For the +import-all-tables+ tool to be useful, the following conditions +must be met: + +- Each table must have a single-column primary key. +- You must intend to import all columns of each table. +- You must not intend to use non-default splitting column, nor impose + any conditions via a +WHERE+ clause. + +Syntax +~~~~~~ + +---- +$ sqoop import-all-tables (generic-args) (import-args) +$ sqoop-import-all-tables (generic-args) (import-args) +---- + +Although the Hadoop generic arguments must preceed any import arguments, +the import arguments can be entered in any order with respect to one +another. + +include::common-args.txt[] + +.Import control arguments: +[grid="all"] +`----------------------------`--------------------------------------- +Argument Description +--------------------------------------------------------------------- ++\--as-sequencefile+ Imports data to SequenceFiles ++\--as-textfile+ Imports data as plain text (default) ++\--direct+ Use direct import fast path ++\--direct-split-size + Split the input stream every 'n' bytes when\ + importing in direct mode ++\--inline-lob-limit + Set the maximum size for an inline LOB ++-m,\--num-mappers + Use 'n' map tasks to import in parallel ++\--warehouse-dir + HDFS parent for table destination ++-z,\--compress+ Enable compression +--------------------------------------------------------------------- + +These arguments behave in the same manner as they do when used for the ++sqoop-import+ tool, but the +\--table+, +\--split-by+, +\--columns+, +and +\--where+ arguments are invalid for +sqoop-import-all-tables+. + +include::output-args.txt[] + +include::input-args.txt[] + +include::hive-args.txt[] + +.Code generation arguments: +[grid="all"] +`------------------------`----------------------------------------------- +Argument Description +------------------------------------------------------------------------- ++\--bindir + Output directory for compiled objects ++\--jar-file + Disable code generation; use specified jar ++\--outdir + Output directory for generated code ++\--package-name + Put auto-generated classes in this package +------------------------------------------------------------------------- + +The +import-all-tables+ tool does not support the +\--class-name+ argument. +You may, however, specify a package with +\--package-name+ in which all +generated classes will be placed. 
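+For example, the following sketch (the package name is illustrative) places
+all generated classes in a single package:
+
+----
+$ sqoop import-all-tables --connect jdbc:mysql://db.foo.com/corp \
+    --package-name com.foocorp.records
+----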
+ +Example Invocations +~~~~~~~~~~~~~~~~~~~ + +Import all tables from the +corp+ database: + +---- +$ sqoop import-all-tables --connect jdbc:mysql://db.foo.com/corp +---- + +Verifying that it worked: + +---- +$ hadoop fs -ls +Found 4 items +drwxr-xr-x - someuser somegrp 0 2010-04-27 17:15 /user/someuser/EMPLOYEES +drwxr-xr-x - someuser somegrp 0 2010-04-27 17:15 /user/someuser/PAYCHECKS +drwxr-xr-x - someuser somegrp 0 2010-04-27 17:15 /user/someuser/DEPARTMENTS +drwxr-xr-x - someuser somegrp 0 2010-04-27 17:15 /user/someuser/OFFICE_SUPPLIES +---- + + diff --git a/src/docs/user/import.txt b/src/docs/user/import.txt new file mode 100644 index 00000000..fc7e5063 --- /dev/null +++ b/src/docs/user/import.txt @@ -0,0 +1,500 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + ++sqoop-import+ +-------------- + +Purpose +~~~~~~~ + +The +import+ tool imports an individual table from an RDBMS to HDFS. +Each row from a table is represented as a separate record in HDFS. +Records can be stored as text files (one record per line), or in +binary representation in SequenceFiles. + +Syntax +~~~~~~ + +---- +$ sqoop import (generic-args) (import-args) +$ sqoop-import (generic-args) (import-args) +---- + +While the Hadoop generic arguments must preceed any import arguments, +you can type the import arguments in any order with respect to one +another. + +NOTE: In this document, arguments are grouped into collections +organized by function. Some collections are present in several tools +(for example, the "common" arguments). An extended description of their +functionality is given only on the first presentation in this +document. + +include::common-args.txt[] + +include::connecting.txt[] + +.Import control arguments: +[grid="all"] +`-----------------------------`-------------------------------------- +Argument Description +--------------------------------------------------------------------- ++\--as-sequencefile+ Imports data to SequenceFiles ++\--as-textfile+ Imports data as plain text (default) ++\--columns + Columns to import from table ++\--direct+ Use direct import fast path ++\--direct-split-size + Split the input stream every 'n' bytes\ + when importing in direct mode ++\--inline-lob-limit + Set the maximum size for an inline LOB ++-m,\--num-mappers + Use 'n' map tasks to import in parallel ++\--split-by + Column of the table used to split work\ + units ++\--table + Table to read ++\--warehouse-dir + HDFS parent for table destination ++\--where + WHERE clause to use during import ++-z,\--compress+ Enable compression +--------------------------------------------------------------------- + + + +Selecting the Data to Import +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Sqoop currently imports data in a table-centric fashion. Use the ++\--table+ argument to select the table to import. 
For example, +\--table +employees+. This argument can also identify a +VIEW+ or other table-like +entity in a database. + +By default, all columns within a table are selected for import. +Imported data is written to HDFS in its "natural order;" that is, a +table containing columns A, B, and C result in an import of data such +as: + +---- +A1,B1,C1 +A2,B2,C2 +... +---- + +You can select a subset of columns and control their ordering by using +the +\--columns+ argument. This should include a comma-delimited list +of columns to import. For example: +\--columns "name,employee_id,jobtitle"+. + +You can control which rows are imported by adding a SQL +WHERE+ clause +to the import statement. By default, Sqoop generates statements of the +form +SELECT FROM +. You can append a ++WHERE+ clause to this with the +\--where+ argument. For example: +\--where +"id > 400"+. Only rows where the +id+ column has a value greater than +400 will be imported. + +Controlling Parallelism +^^^^^^^^^^^^^^^^^^^^^^^ + +Sqoop imports data in parallel from most database sources. You can +specify the number +of map tasks (parallel processes) to use to perform the import by +using the +-m+ or +\--num-mappers+ argument. Each of these arguments +takes an integer value which corresponds to the degree of parallelism +to employ. By default, four tasks are used. Some databases may see +improved performance by increasing this value to 8 or 16. Do not +increase the degree of parallelism greater than that available within +your MapReduce cluster; tasks will run serially and will likely +increase the amount of time required to perform the import. Likewise, +do not increase the degree of parallism higher than that which your +database can reasonably support. Connecting 100 concurrent clients to +your database may increase the load on the database server to a point +where performance suffers as a result. + +When performing parallel imports, Sqoop needs a criterion by which it +can split the workload. Sqoop uses a _splitting column_ to split the +workload. By default, Sqoop will identify the primary key column (if +present) in a table and use it as the splitting column. The low and +high values for the splitting column are retrieved from the database, +and the map tasks operate on evenly-sized components of the total +range. For example, if you had a table with a primary key column of ++id+ whose minimum value was 0 and maximum value was 1000, and Sqoop +was directed to use 4 tasks, Sqoop would run four processes which each +execute SQL statements of the form +SELECT * FROM sometable WHERE id +>= lo AND id < hi+, with +(lo, hi)+ set to (0, 250), (250, 500), +(500, 750), and (750, 1001) in the different tasks. + +If the actual values for the primary key are not uniformly distributed +across its range, then this can result in unbalanced tasks. You should +explicitly choose a different column with the +\--split-by+ argument. +For example, +\--split-by employee_id+. Sqoop cannot currently split on +multi-column indices. If your table has no index column, or has a +multi-column key, then you must also manually choose a splitting +column. + +Controlling the Import Process +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, the import process will use JDBC which provides a +reasonable cross-vendor import channel. Some databases can perform +imports in a more high-performance fashion by using database-specific +data movement tools. For example, MySQL provides the +mysqldump+ tool +which can export data from MySQL to other systems very quickly. 
By +supplying the +\--direct+ argument, you are specifying that Sqoop +should attempt the direct import channel. This channel may be +higher performance than using JDBC. Currently, direct mode does not +support imports of large object columns. + +When importing from PostgreSQL in conjunction with direct mode, you +can split the import into separate files after +individual files reach a certain size. This size limit is controlled +with the +\--direct-split-size+ argument. + +By default, Sqoop will import a table named +foo+ to a directory named ++foo+ inside your home directory in HDFS. For example, if your +username is +someuser+, then the import tool will write to ++/user/someuser/foo/(files)+. You can adjust the parent directory of +the import with the +\--warehouse-dir+ argument. For example: + +---- +$ sqoop import --connnect --table foo --warehouse-dir /shared \ + ... +---- + +This command would write to a set of files in the +/shared/foo/+ directory. + +When using direct mode, you can specify additional arguments which +should be passed to the underlying tool. If the argument ++\--+ is given on the command-line, then subsequent arguments are sent +directly to the underlying tool. For example, the following adjusts +the character set used by +mysqldump+: + +---- +$ sqoop import --connect jdbc:mysql://server.foo.com/db --table bar \ + --direct -- --default-character-set=latin1 +---- + +File Formats +^^^^^^^^^^^^ + +You can import data in one of two file formats: delimited text or +SequenceFiles. + +Delimited text is the default import format. You can also specify it +explicitly by using the +\--as-textfile+ argument. This argument will write +string-based representations of each record to the output files, with +delimiter characters between individual columns and rows. These +delimiters may be commas, tabs, or other characters. (The delimiters +can be selected; see "Output line formatting arguments.") The +following is the results of an example text-based import: + +---- +1,here is a message,2010-05-01 +2,happy new year!,2010-01-01 +3,another message,2009-11-12 +---- + +Delimited text is appropriate for most non-binary data types. It also +readily supports further manipulation by other tools, such as Hive. + +SequenceFiles are a binary format that store individual records in +custom record-specific data types. These data types are manifested as +Java classes. Sqoop will automatically generate these data types for +you. This format supports exact storage of all data in binary +representations, and is appropriate for storing binary data +(for example, +VARBINARY+ columns), or data that will be principly +manipulated by custom MapReduce programs (reading from SequenceFiles +is higher-performance than reading from text files, as records do not +need to be parsed). + +By default, data is not compressed. You can compress +your data by using the deflate (gzip) algorithm with the +-z+ or ++\--compress+ argument. This applies to both SequenceFiles or text +files. + +Large Objects +^^^^^^^^^^^^^ + +Sqoop handles large objects (+BLOB+ and +CLOB+ columns) in particular +ways. If this data is truly large, then these columns should not be +fully materialized in memory for manipulation, as most columns are. +Instead, their data is handled in a streaming fashion. Large objects +can be stored inline with the rest of the data, in which case they are +fully materialized in memory on every access, or they can be stored in +a secondary storage file linked to the primary data storage. 
By +default, large objects less than 16 MB in size are stored inline with +the rest of the data. At a larger size, they are stored in files in +the +_lobs+ subdirectory of the import target directory. These files +are stored in a separate format optimized for large record storage, +which can accomodate records of up to 2^63 bytes each. The size at +which lobs spill into separate files is controlled by the ++\--inline-lob-limit+ argument, which takes a parameter specifying the +largest lob size to keep inline, in bytes. If you set the inline LOB +limit to 0, all large objects will be placed in external +storage. + +include::output-args.txt[] + +When importing to delimited files, the choice of delimiter is +important. Delimiters which appear inside string-based fields may +cause ambiguous parsing of the imported data by subsequent analysis +passes. For example, the string +"Hello, pleased to meet you"+ should +not be imported with the end-of-field delimiter set to a comma. + +Delimiters may be specified as: + +- a character (+\--fields-terminated-by X+) +- an escape character (+\--fields-terminated-by \t+). Supported escape + characters are: +* +\b+ (backspace) +* +\n+ (newline) +* +\r+ (carriage return) +* +\t+ (tab) +* +\"+ (double-quote) +* +\\'+ (single-quote) +* +\\+ (backslash) +* +\0+ (NUL) - This will insert NUL characters between fields or lines, + or will disable enclosing/escaping if used for one of the +\--enclosed-by+, + +\--optionally-enclosed-by+, or +\--escaped-by+ arguments. +- The octal representation of a A UTF-8 character's code point. This + should be of the form +\0ooo+, where _ooo_ is the octal value. + For example, +\--fields-terminated-by \001+ would yield the +^A+ character. +- The hexadecimal representation of a A UTF-8 character's code point. This + should be of the form +\0xhhh+, where _hhh_ is the hex value. + For example, +\--fields-terminated-by \0x10+ would yield the carriage + return character. + +The default delimiters are a comma (+,+) for fields, a newline (+\n+) for records, no quote +character, and no escape character. Note that this can lead to +ambiguous/unparsible records if you import database records containing +commas or newlines in the field data. For unambiguous parsing, both must +be enabled. For example, via +\--mysql-delimiters+. + +If unambiguous delimiters cannot be presented, then use _enclosing_ and +_escaping_ characters. The combination of (optional) +enclosing and escaping characters will allow unambiguous parsing of +lines. For example, suppose one column of a dataset contained the +following values: + +---- +Some string, with a comma. +Another "string with quotes" +---- + +The following arguments would provide delimiters which can be +unambiguously parsed: + +---- +$ sqoop import --fields-terminated-by , --escaped-by \\ --enclosed-by '\"' ... +---- + +(Note that to prevent the shell from mangling the enclosing character, +we have enclosed that argument itself in single-quotes.) + +The result of the above arguments applied to the above dataset would +be: + +---- +"Some string, with a comma.","1","2","3"... +"Another \"string with quotes\"","4","5","6"... +---- + +Here the imported strings are shown in the context of additional +columns (+"1","2","3"+, etc.) to demonstrate the full effect of enclosing +and escaping. The enclosing character is only strictly necessary when +delimiter characters appear in the imported text. 
The enclosing +character can therefore be specified as optional: + +---- +$ sqoop import --optionally-enclosed-by '\"' (the rest as above)... +---- + +Which would result in the following import: + +---- +"Some string, with a comma.",1,2,3... +"Another \"string with quotes\"",4,5,6... +---- + +NOTE: Hive does not support enclosing and escaping characters. You +must choose unambiguous field and record-terminating delimiters +without the help of escaping and enclosing characters when +working with Hive; this is a limitation of Hive's input parsing +abilities. + +The +\--mysql-delimiters+ argument is a shorthand argument which uses +the default delimiters for the +mysqldump+ program. +If you use the +mysqldump+ delimiters in conjunction with a +direct-mode import (with +\--direct+), very fast imports can be +achieved. + +While the choice of delimiters is most important for a text-mode +import, it is still relevant if you import to SequenceFiles with ++\--as-sequencefile+. The generated class' +toString()+ method +will use the delimiters you specify, so subsequent formatting of +the output data will rely on the delimiters you choose. + +include::input-args.txt[] + +When Sqoop imports data to HDFS, it generates a Java class which can +reinterpret the text files that it creates when doing a +delimited-format import. The delimiters are chosen with arguments such +as +\--fields-terminated-by+; this controls both how the data is +written to disk, and how the generated +parse()+ method reinterprets +this data. The delimiters used by the +parse()+ method can be chosen +independently of the output arguments, by using ++\--input-fields-terminated-by+, and so on. This is useful, for example, to +generate classes which can parse records created with one set of +delimiters, and emit the records to a different set of files using a +separate set of delimiters. + +include::hive-args.txt[] + +include::hive.txt[] + +include::codegen-args.txt[] + +As mentioned earlier, a byproduct of importing a table to HDFS is a +class which can manipulate the imported data. If the data is stored in +SequenceFiles, this class will be used for the data's serialization +container. Therefore, you should use this class in your subsequent +MapReduce processing of the data. + +The class is typically named after the table; a table named +foo+ will +generate a class named +foo+. You may want to override this class +name. For example, if your table is named +EMPLOYEES+, you may want to +specify +\--class-name Employee+ instead. Similarly, you can specify +just the package name with +\--package-name+. The following import +generates a class named +com.foocorp.SomeTable+: + +---- +$ sqoop import --connect --table SomeTable --package-name com.foocorp +---- + +The +.java+ source file for your class will be written to the current +working directory when you run +sqoop+. You can control the output +directory with +\--outdir+. For example, +\--outdir src/generated/+. + +The import process compiles the source into +.class+ and +.jar+ files; +these are ordinarily stored under +/tmp+. You can select an alternate +target directory with +\--bindir+. For example, +\--bindir /scratch+. + +If you already have a compiled class that can be used to perform the +import and want to suppress the code-generation aspect of the import +process, you can use an existing jar and class by +providing the +\--jar-file+ and +\--class-name+ options. 
For example: + +---- +$ sqoop import --table SomeTable --jar-file mydatatypes.jar \ + --class-name SomeTableType +---- + +This command will load the +SomeTableType+ class out of +mydatatypes.jar+. + + +Example Invocations +~~~~~~~~~~~~~~~~~~~ + +The following examples illustrate how to use the import tool in a variety +of situations. + +A basic import of a table named +EMPLOYEES+ in the +corp+ database: + +---- +$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES +---- + +A basic import requiring a login: + +---- +$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \ + --username SomeUser -P +Enter password: (hidden) +---- + +Selecting specific columns from the +EMPLOYEES+ table: + +---- +$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \ + --columns "employee_id,first_name,last_name,job_title" +---- + +Controlling the import parallelism (using 8 parallel tasks): + +---- +$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \ + -m 8 +---- + +Enabling the MySQL "direct mode" fast path: + +---- +$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \ + --direct +---- + +Storing data in SequenceFiles, and setting the generated class name to ++com.foocorp.Employee+: + +---- +$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \ + --class-name com.foocorp.Employee --as-sequencefile +---- + +Specifying the delimiters to use in a text-mode import: + +---- +$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \ + --fields-terminated-by '\t' --lines-terminated-by '\n' \ + --optionally-enclosed-by '\"' +---- + +Importing the data to Hive: + +---- +$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \ + --hive-import +---- + +Importing only new employees: + +---- +$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \ + --where "start_date > '2010-01-01'" +---- + +Changing the splitting column from the default: + +---- +$ sqoop import --connect jdbc:mysql://db.foo.com/corp --table EMPLOYEES \ + --split-by dept_id +---- + +Verifying that an import was successful: + +---- +$ hadoop fs -ls EMPLOYEES +Found 5 items +drwxr-xr-x - someuser somegrp 0 2010-04-27 16:40 /user/someuser/EMPLOYEES/_logs +-rw-r--r-- 1 someuser somegrp 2913511 2010-04-27 16:40 /user/someuser/EMPLOYEES/part-m-00000 +-rw-r--r-- 1 someuser somegrp 1683938 2010-04-27 16:40 /user/someuser/EMPLOYEES/part-m-00001 +-rw-r--r-- 1 someuser somegrp 7245839 2010-04-27 16:40 /user/someuser/EMPLOYEES/part-m-00002 +-rw-r--r-- 1 someuser somegrp 7842523 2010-04-27 16:40 /user/someuser/EMPLOYEES/part-m-00003 + +$ hadoop fs -cat EMPLOYEES/part-m-00000 | head -n 10 +0,joe,smith,engineering +1,jane,doe,marketing +... +---- + + diff --git a/src/docs/user/input-args.txt b/src/docs/user/input-args.txt new file mode 100644 index 00000000..43707fe8 --- /dev/null +++ b/src/docs/user/input-args.txt @@ -0,0 +1,34 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + +.Input parsing arguments: +[grid="all"] +`----------------------------------------`---------------------------------- +Argument Description +---------------------------------------------------------------------------- ++\--input-enclosed-by + Sets a required field encloser ++\--input-escaped-by + Sets the input escape \ + character ++\--input-fields-terminated-by + Sets the input field separator ++\--input-lines-terminated-by + Sets the input end-of-line \ + character ++\--input-optionally-enclosed-by + Sets a field enclosing \ + character +---------------------------------------------------------------------------- + diff --git a/src/docs/input-formatting-args.txt b/src/docs/user/input-formatting-args.txt similarity index 100% rename from src/docs/input-formatting-args.txt rename to src/docs/user/input-formatting-args.txt diff --git a/src/docs/input-formatting.txt b/src/docs/user/input-formatting.txt similarity index 100% rename from src/docs/input-formatting.txt rename to src/docs/user/input-formatting.txt diff --git a/src/docs/user/intro.txt b/src/docs/user/intro.txt new file mode 100644 index 00000000..8138a32b --- /dev/null +++ b/src/docs/user/intro.txt @@ -0,0 +1,45 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + +Introduction +------------ + +Sqoop is a tool designed to transfer data between Hadoop and +relational databases. You can use Sqoop to import data from a +relational database management system (RDBMS) such as MySQL or Oracle +into the Hadoop Distributed File System (HDFS), +transform the data in Hadoop MapReduce, and then export the data back +into an RDBMS. + +Sqoop automates most of this process, relying on the database to +describe the schema for the data to be imported. Sqoop uses MapReduce +to import and export the data, which provides parallel operation as +well as fault tolerance. + +This document describes how to get started using Sqoop to move data +between databases and Hadoop and provides reference information for +the operation of the Sqoop command-line tool suite. 
This document is
+intended for:
+
+- System and application programmers
+- System administrators
+- Database administrators
+- Data analysts
+- Data engineers
+
diff --git a/src/docs/classnames.txt b/src/docs/user/list-databases.txt
similarity index 50%
rename from src/docs/classnames.txt
rename to src/docs/user/list-databases.txt
index 713e4cc4..24872f01 100644
--- a/src/docs/classnames.txt
+++ b/src/docs/user/list-databases.txt
@@ -17,27 +17,39 @@
 ////
 
-Generated Class Names
-~~~~~~~~~~~~~~~~~~~~~
++sqoop-list-databases+
+----------------------
 
-By default, classes are named after the table they represent. e.g.,
-+sqoop --table foo+ will generate a file named +foo.java+. You can
-override the generated class name with the +--class-name+ argument.
+Purpose
+~~~~~~~
+
+List database schemas present on a server.
+
+Syntax
+~~~~~~
 
 ----
-$ sqoop --connect jdbc:mysql://database.example.com/employees \
-    --table employee_names --class-name com.example.EmployeeNames
+$ sqoop list-databases (generic-args) (list-databases-args)
+$ sqoop-list-databases (generic-args) (list-databases-args)
 ----
-_This generates a file named +com/example/EmployeeNames.java+_
 
-If you want to specify a package name for generated classes, but
-still want them to be named after the table they represent, you
-can instead use the argument +--package-name+:
+Although the Hadoop generic arguments must precede any list-databases
+arguments, the list-databases arguments can be entered in any order
+with respect to one another.
+
+include::common-args.txt[]
+
+Example Invocations
+~~~~~~~~~~~~~~~~~~~
+
+List database schemas available on a MySQL server:
 
 ----
-$ sqoop --connect jdbc:mysql://database.example.com/employees \
-    --table employee_names --package-name com.example
+$ sqoop list-databases --connect jdbc:mysql://database.example.com/
+information_schema
+employees
 ----
-_This generates a file named +com/example/employee_names.java+_
+
+NOTE: This only works with HSQLDB and MySQL. A vendor-agnostic implementation
+of this function has not yet been implemented.
diff --git a/src/docs/user/list-tables.txt b/src/docs/user/list-tables.txt
new file mode 100644
index 00000000..cdd84aec
--- /dev/null
+++ b/src/docs/user/list-tables.txt
@@ -0,0 +1,54 @@
+
+////
+  Licensed to Cloudera, Inc. under one or more
+  contributor license agreements. See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License. You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+////
+
+
++sqoop-list-tables+
+-------------------
+
+Purpose
+~~~~~~~
+
+List tables present in a database.
+
+Syntax
+~~~~~~
+
+----
+$ sqoop list-tables (generic-args) (list-tables-args)
+$ sqoop-list-tables (generic-args) (list-tables-args)
+----
+
+Although the Hadoop generic arguments must precede any list-tables
+arguments, the list-tables arguments can be entered in any order
+with respect to one another.
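+
+For example, a generic option such as +-D+ must come before the
+list-tables arguments. The property and connect string shown here are
+purely illustrative:
+
+----
+$ sqoop list-tables -D mapred.job.queue.name=default \
+    --connect jdbc:mysql://database.example.com/corp
+----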
+ +include::common-args.txt[] + +Example Invocations +~~~~~~~~~~~~~~~~~~~ + +List tables available in the "corp" database: + +---- +$ sqoop list-tables --connect jdbc:mysql://database.example.com/corp +employees +payroll_checks +job_descriptions +office_supplies +---- + diff --git a/src/docs/user/output-args.txt b/src/docs/user/output-args.txt new file mode 100644 index 00000000..b31d4465 --- /dev/null +++ b/src/docs/user/output-args.txt @@ -0,0 +1,35 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + +.Output line formatting arguments: +[grid="all"] +`----------------------------------`---------------------------------- +Argument Description +---------------------------------------------------------------------- ++\--enclosed-by + Sets a required field enclosing \ + character ++\--escaped-by + Sets the escape character ++\--fields-terminated-by + Sets the field separator character ++\--lines-terminated-by + Sets the end-of-line character ++\--mysql-delimiters+ Uses MySQL's default delimiter set:\ + fields: +,+ lines: +\n+ \ + escaped-by: +\+ \ + optionally-enclosed-by: +'+ ++\--optionally-enclosed-by + Sets a field enclosing character +---------------------------------------------------------------------- + diff --git a/src/docs/output-formatting-args.txt b/src/docs/user/output-formatting-args.txt similarity index 100% rename from src/docs/output-formatting-args.txt rename to src/docs/user/output-formatting-args.txt diff --git a/src/docs/output-formatting.txt b/src/docs/user/output-formatting.txt similarity index 80% rename from src/docs/output-formatting.txt rename to src/docs/user/output-formatting.txt index 92a3bb15..1a939fd8 100644 --- a/src/docs/output-formatting.txt +++ b/src/docs/user/output-formatting.txt @@ -19,26 +19,26 @@ The delimiters used to separate fields and records can be specified on the command line, as can a quoting character and an escape character (for quoting delimiters inside a values). Data imported with -+--as-textfile+ will be formatted according to these parameters. Classes ++\--as-textfile+ will be formatted according to these parameters. Classes generated by Sqoop will encode this information, so using +toString()+ -from a data record stored +--as-sequencefile+ will reproduce your +from a data record stored +\--as-sequencefile+ will reproduce your specified formatting. The +(char)+ argument for each argument in this section can be specified -either as a normal character (e.g., +--fields-terminated-by ,+) or via +either as a normal character (e.g., +\--fields-terminated-by ,+) or via an escape sequence. Arguments of the form +\0xhhh+ will be interpreted as a hexidecimal representation of a character with hex number _hhh_. Arguments of the form +\0ooo+ will be treated as an octal representation of a character represented by octal number _ooo_. 
The special escapes +\n+, +\r+, +\"+, +\b+, +\t+, and +\\+ act as they do inside Java strings. +\0+ will be treated as NUL. This will insert NUL characters between fields or lines -(if used for +--fields-terminated-by+ or +--lines-terminated-by+), or will -disable enclosing/escaping if used for one of the +--enclosed-by+, -+--optionally-enclosed-by+, or +--escaped-by+ arguments. +(if used for +\--fields-terminated-by+ or +\--lines-terminated-by+), or will +disable enclosing/escaping if used for one of the +\--enclosed-by+, ++\--optionally-enclosed-by+, or +\--escaped-by+ arguments. The default delimiters are +,+ for fields, +\n+ for records, no quote character, and no escape character. Note that this can lead to ambiguous/unparsible records if you import database records containing commas or newlines in the field data. For unambiguous parsing, both must -be enabled, e.g., via +--mysql-delimiters+. +be enabled, e.g., via +\--mysql-delimiters+. diff --git a/src/docs/user/preface.txt b/src/docs/user/preface.txt new file mode 100644 index 00000000..6d956327 --- /dev/null +++ b/src/docs/user/preface.txt @@ -0,0 +1,61 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + + + +Supported Releases +------------------ + +This documentation applies to Sqoop v1.0.0 (June 2010). + +Sqoop Releases +-------------- + +Sqoop is an open source software product of Cloudera, Inc. + +Software development for Sqoop occurs at http://github.com/cloudera/sqoop. +At that site you can obtain: +- New releases of Sqoop as well as its most recent source code +- An issue tracker +- A wiki that contains Sqoop documentation + +Sqoop is compatible with Apache Hadoop 0.21 and Cloudera's +Distribution of Hadoop version 3. + +Prerequisites +------------- + +The following prerequisite knowledge is required for this product: + +- Basic computer technology and terminology +- Familiarity with command-line interfaces such as +bash+ +- Relational database management systems +- Basic familiarity with the purpose and operation of Hadoop + +Before you can use Sqoop, a release of Hadoop must be installed and +configured. We recommend that you download Cloudera's Distribution +for Hadoop (CDH3) from the Cloudera Software Archive at +http://archive.cloudera.com for straightforward installation of Hadoop +on Linux systems. + +This document assumes you are using a Linux or Linux-like environment. +If you are using Windows, you may be able to use cygwin to accomplish +most of the following tasks. If you are using Mac OS X, you should see +few (if any) compatibility errors. Sqoop is predominantly operated and +tested on Linux. 
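+
+Regardless of platform, it is a good idea to confirm that the +hadoop+
+launcher is on your path before running Sqoop; for example:
+
+----
+$ hadoop version
+----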
+ diff --git a/src/docs/user/support.txt b/src/docs/user/support.txt new file mode 100644 index 00000000..36927b4c --- /dev/null +++ b/src/docs/user/support.txt @@ -0,0 +1,33 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + +Getting Support +--------------- + +Report bugs in Sqoop to the issue tracker at +http://github.com/cloudera/sqoop/issues[]. + +For general questions and answers, a support forum is available at +http://getsatisfaction.com/cloudera/products/cloudera_sqoop[]. + +Before contacting either forum, run your Sqoop job with the ++\--verbose+ flag to acquire as much debugging information as +possible. Also report the string returned by +sqoop version+ as +well as the version of Hadoop you are running (+hadoop version+). + + diff --git a/src/docs/user/tools.txt b/src/docs/user/tools.txt new file mode 100644 index 00000000..cdcc2661 --- /dev/null +++ b/src/docs/user/tools.txt @@ -0,0 +1,168 @@ + +//// + Licensed to Cloudera, Inc. under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + Cloudera, Inc. licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +//// + +Sqoop Tools +----------- + +Sqoop is a collection of related tools. To use Sqoop, you specify the +tool you want to use and the arguments that control the tool. + +If Sqoop is compiled from its own source, you can run Sqoop without a formal +installation process by running the +bin/sqoop+ program. Users +of a packaged deployment of Sqoop (such as an RPM shipped with Cloudera's +Distribution for Hadoop) will see this program installed as +/usr/bin/sqoop+. +The remainder of this documentation will refer to this program as ++sqoop+. For example: + +---- +$ sqoop tool-name [tool-arguments] +---- + +NOTE: The following examples that begin with a +$+ character indicate +that the commands must be entered at a terminal prompt (such as ++bash+). The +$+ character represents the prompt itself; you should +not start these commands by typing a +$+. You can also enter commands +inline in the text of a paragraph; for example, +sqoop help+. These +examples do not show a +$+ prefix, but you should enter them the same +way. 
Don't confuse the +$+ shell prompt in the examples with the +$+
+that precedes an environment variable name. For example, the string
+literal +$HADOOP_HOME+ includes a "+$+".
+
+Sqoop ships with a help tool. To display a list of all available
+tools, type the following command:
+
+----
+$ sqoop help
+usage: sqoop COMMAND [ARGS]
+
+Available commands:
+  codegen            Generate code to interact with database records
+  create-hive-table  Import a table definition into Hive
+  eval               Evaluate a SQL statement and display the results
+  export             Export an HDFS directory to a database table
+  help               List available commands
+  import             Import a table from a database to HDFS
+  import-all-tables  Import tables from a database to HDFS
+  list-databases     List available databases on a server
+  list-tables        List available tables in a database
+  version            Display version information
+
+See 'sqoop help COMMAND' for information on a specific command.
+----
+
+You can display help for a specific tool by entering: +sqoop help
+(tool-name)+; for example, +sqoop help import+.
+
+You can also add the +\--help+ argument to any command: +sqoop import
+\--help+.
+
+Using Command Aliases
+~~~~~~~~~~~~~~~~~~~~~
+
+In addition to typing the +sqoop (toolname)+ syntax, you can use alias
+scripts that specify the +sqoop-(toolname)+ syntax. For example, the
+scripts +sqoop-import+, +sqoop-export+, etc. each select a specific
+tool.
+
+Controlling the Hadoop Installation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You invoke Sqoop through the program launch capability provided by
+Hadoop. The +sqoop+ command-line program is a wrapper which runs the
++bin/hadoop+ script shipped with Hadoop. If you have multiple
+installations of Hadoop present on your machine, you can select the
+Hadoop installation by setting the +$HADOOP_HOME+ environment
+variable.
+
+For example:
+
+----
+$ HADOOP_HOME=/path/to/some/hadoop sqoop import --arguments...
+----
+
+or:
+
+----
+$ export HADOOP_HOME=/some/path/to/hadoop
+$ sqoop import --arguments...
+----
+
+If +$HADOOP_HOME+ is not set, Sqoop will use the default installation
+location for Cloudera's Distribution for Hadoop, +/usr/lib/hadoop+.
+
+The active Hadoop configuration is loaded from +$HADOOP_HOME/conf/+,
+unless the +$HADOOP_CONF_DIR+ environment variable is set.
+
+
+Using Generic and Specific Arguments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To control the operation of each Sqoop tool, you use generic and
+specific arguments.
+
+For example:
+
+----
+$ sqoop help import
+usage: sqoop import [GENERIC-ARGS] [TOOL-ARGS]
+
+Common arguments:
+   --connect <jdbc-uri>     Specify JDBC connect string
+   --driver <class-name>    Manually specify JDBC driver class to use
+   --hadoop-home <dir>      Override $HADOOP_HOME
+   --help                   Print usage instructions
+-P                          Read password from console
+   --password <password>    Set authentication password
+   --username <username>    Set authentication username
+   --verbose                Print more information while working
+
+[...]
+
+Generic Hadoop command-line arguments:
+(must preceed any tool-specific arguments)
+Generic options supported are
+-conf <configuration file>     specify an application configuration file
+-D <property=value>            use value for given property
+-fs <local|namenode:port>      specify a namenode
+-jt <local|jobtracker:port>    specify a job tracker
+-files <comma separated list of files>    specify comma separated files to be copied to the map reduce cluster
+-libjars <comma separated list of jars>    specify comma separated jar files to include in the classpath.
+-archives <comma separated list of archives>    specify comma separated archives to be unarchived on the compute machines.
+
+The general command line syntax is
+bin/hadoop command [genericOptions] [commandOptions]
+----
+
+You must supply the generic arguments +-conf+, +-D+, and so on after the
+tool name but *before* any tool-specific arguments (such as
++\--connect+). Note that generic Hadoop arguments are preceded by a
+single dash character (+-+), whereas tool-specific arguments start
+with two dashes (+\--+), unless they are single character arguments such as
+-P+.
+
+The +-conf+, +-D+, +-fs+, and +-jt+ arguments control the configuration
+and Hadoop server settings. The +-files+, +-libjars+, and +-archives+
+arguments are not typically used with Sqoop, but they are included as
+part of Hadoop's internal argument-parsing system.
+
+
+Using Tools
+~~~~~~~~~~~
+
+The following sections will describe each tool's operation. The
+tools are listed in the most likely order you will find them useful.
+
diff --git a/src/docs/listing-tables.txt b/src/docs/user/version.txt
similarity index 67%
rename from src/docs/listing-tables.txt
rename to src/docs/user/version.txt
index 852b9b98..077eb2f1 100644
--- a/src/docs/listing-tables.txt
+++ b/src/docs/user/version.txt
@@ -17,18 +17,32 @@
 ////
 
-Listing Available Tables
-~~~~~~~~~~~~~~~~~~~~~~~~
++sqoop-version+
+---------------
 
-Within a database, you can list the tables available for import with
-the +--list-tables+ command. The following example shows four tables available
-within the "employees" example database:
+Purpose
+~~~~~~~
+
+Display version information for Sqoop.
+
+Syntax
+~~~~~~
 
 ----
-$ sqoop --connect jdbc:mysql://database.example.com/employees --list-tables
-employee_names
-payroll_checks
-job_descriptions
-office_supplies
+$ sqoop version
+$ sqoop-version
+----
+
+
+Example Invocations
+~~~~~~~~~~~~~~~~~~~
+
+Display the version:
+
+----
+$ sqoop version
+Sqoop 1.0.0
+git commit id 46b3e06b79a8411320d77c984c3030db47dd1c22
+Compiled by aaron@jargon on Mon May 17 13:43:22 PDT 2010
 ----
diff --git a/src/java/org/apache/hadoop/sqoop/io/LobFile.java b/src/java/org/apache/hadoop/sqoop/io/LobFile.java
index 6804f145..aceedf94 100644
--- a/src/java/org/apache/hadoop/sqoop/io/LobFile.java
+++ b/src/java/org/apache/hadoop/sqoop/io/LobFile.java
@@ -1092,7 +1092,11 @@ public abstract static class Reader implements Closeable {
     public abstract Path getPath();
 
     /**
-     * Report the current position in the file.
+     * Report the current position in the file. Note that the internal
+     * cursor may move in an unpredictable fashion; e.g., to fetch
+     * additional data from the index stored at the end of the file.
+     * Clients may be more interested in the getRecordOffset() method
+     * which returns the starting offset of the current record.
      * @return the current offset from the start of the file in bytes.
*/ public abstract long tell() throws IOException; diff --git a/src/java/org/apache/hadoop/sqoop/tool/BaseSqoopTool.java b/src/java/org/apache/hadoop/sqoop/tool/BaseSqoopTool.java index 1b73b4ba..e0a90107 100644 --- a/src/java/org/apache/hadoop/sqoop/tool/BaseSqoopTool.java +++ b/src/java/org/apache/hadoop/sqoop/tool/BaseSqoopTool.java @@ -102,7 +102,7 @@ public abstract class BaseSqoopTool extends SqoopTool { public static final String PACKAGE_NAME_ARG = "package-name"; public static final String CLASS_NAME_ARG = "class-name"; public static final String JAR_FILE_NAME_ARG = "jar-file"; - public static final String DEBUG_SQL_ARG = "expr"; + public static final String DEBUG_SQL_ARG = "query"; public static final String DEBUG_SQL_SHORT_ARG = "e"; public static final String VERBOSE_ARG = "verbose"; public static final String HELP_ARG = "help"; @@ -399,7 +399,7 @@ protected RelatedOptions getCodeGenOpts(boolean multiTable) { if (!multiTable) { codeGenOpts.addOption(OptionBuilder.withArgName("name") .hasArg() - .withDescription("Sets the generated class name." + .withDescription("Sets the generated class name. " + "This overrides --" + PACKAGE_NAME_ARG + ". When combined " + "with --" + JAR_FILE_NAME_ARG + ", sets the input class.") .withLongOpt(CLASS_NAME_ARG)