From 70caf779b0188f4d94bffeb0bd85cfde1e14a489 Mon Sep 17 00:00:00 2001 From: Andrew Bayer Date: Fri, 22 Jul 2011 20:04:26 +0000 Subject: [PATCH] SQOOP-142. Document requirements for direct import Updated the documentation with details on direct mode execution requirements. From: Arvind Prabhakar git-svn-id: https://svn.apache.org/repos/asf/incubator/sqoop/trunk@1150005 13f79535-47bb-0310-9956-ffa450edef68 --- src/docs/man/import-args.txt | 12 +++++-- src/docs/man/sqoop-export.txt | 6 +++- src/docs/man/sqoop-import.txt | 65 +++++------------------------------ src/docs/user/export.txt | 3 ++ src/docs/user/import.txt | 7 +++- 5 files changed, 32 insertions(+), 61 deletions(-) diff --git a/src/docs/man/import-args.txt b/src/docs/man/import-args.txt index 538e9136..d5ba3f0c 100644 --- a/src/docs/man/import-args.txt +++ b/src/docs/man/import-args.txt @@ -33,7 +33,7 @@ Import control options Columns to export from table --direct:: - Use direct import fast path (mysql only) + Use direct import fast path (MySQL and PostgreSQL) --direct-split-size (n):: Split the input stream every 'n' bytes when importing in direct mode. @@ -41,8 +41,8 @@ Import control options --inline-lob-limit (n):: Set the maximum size for an inline LOB --m:: --num-mappers (n):: +-m:: Use 'n' map tasks to import in parallel --query (statement):: @@ -63,8 +63,14 @@ Import control options --where (clause):: Import only the rows for which _clause_ is true. 
e.g.: `--where "user_id > 400 AND hidden == 0"` - + --compress:: -z:: Uses gzip to compress data as it is written to HDFS +--null-string:: + The string to be written for a null value for string columns + +--null-non-string:: + The string to be written for a null value for non-string columns + diff --git a/src/docs/man/sqoop-export.txt b/src/docs/man/sqoop-export.txt index 17129dea..6090ea15 100644 --- a/src/docs/man/sqoop-export.txt +++ b/src/docs/man/sqoop-export.txt @@ -28,7 +28,7 @@ Export control options ~~~~~~~~~~~~~~~~~~~~~~ --direct:: - Use direct import fast path (mysql only) + Use direct export fast path (MySQL) --export-dir (dir):: HDFS source path for the export @@ -70,6 +70,10 @@ after a lone '--' on the command-line. In MySQL direct mode, additional arguments are passed directly to mysqldump. +Note: When using MySQL direct mode, the MySQL bulk utilities ++mysqldump+ and +mysqlimport+ should be available on the task nodes and +present in the shell path of the task process. + ENVIRONMENT ----------- diff --git a/src/docs/man/sqoop-import.txt b/src/docs/man/sqoop-import.txt index 735738f9..d63a3a18 100644 --- a/src/docs/man/sqoop-import.txt +++ b/src/docs/man/sqoop-import.txt @@ -24,62 +24,7 @@ The +--connect+ and +--table+ options are required. include::common-args.txt[] -Import control options -~~~~~~~~~~~~~~~~~~~~~~ - ---append:: - Append data to an existing HDFS dataset - ---as-sequencefile:: - Imports data to SequenceFiles - ---as-textfile:: - Imports data as plain text (default) - ---columns (col,col,col...):: - Columns to export from table - ---direct:: - Use direct import fast path (mysql only) - ---direct-split-size (n):: - Split the input stream every 'n' bytes when importing in direct mode. 
- ---inline-lob-limit (n):: - Set the maximum size for an inline LOB - ---num-mappers (n):: --m:: - Use 'n' map tasks to import in parallel - ---query (statement):: - Imports the results of +statement+ instead of a table - ---split-by (column-name):: - Column of the table used to split the table for parallel import - ---table (table-name):: - The table to import - ---target-dir (dir):: - Explicit HDFS target directory for the import. - ---warehouse-dir (dir):: - Tables are uploaded to the HDFS path +/warehouse/dir/(tablename)/+ - ---where (clause):: - Import only the rows for which _clause_ is true. - e.g.: `--where "user_id > 400 AND hidden == 0"` - ---compress:: --z:: - Uses gzip to compress data as it is written to HDFS - ---null-string:: - The string to be written for a null value for string columns - ---null-non-string:: - The string to be written for a null value for non-string columns +include::import-args.txt[] include::output-args.txt[] @@ -126,6 +71,14 @@ after a lone '--' on the command-line. In MySQL direct mode, additional arguments are passed directly to mysqldump. +Note: When using MySQL direct mode, the MySQL bulk utilities ++mysqldump+ and +mysqlimport+ should be available on the task nodes and +present in the shell path of the task process. + +Note: When using PostgreSQL direct mode, the PostgreSQL client utility ++psql+ should be available on the task nodes and present in the shell path +of the task process. + ENVIRONMENT ----------- diff --git a/src/docs/user/export.txt b/src/docs/user/export.txt index b30b0b5b..4f878865 100644 --- a/src/docs/user/export.txt +++ b/src/docs/user/export.txt @@ -83,6 +83,9 @@ MySQL provides a direct mode for exports as well, using the to specify this codepath. This may be higher-performance than the standard JDBC codepath. +NOTE: When using export in direct mode with MySQL, the MySQL bulk utility ++mysqlimport+ must be available in the shell path of the task process. 
+ The +\--input-null-string+ and +\--input-null-non-string+ arguments are optional. If +\--input-null-string+ is not specified, then the string "null" will be interpreted as null for string-type columns. diff --git a/src/docs/user/import.txt b/src/docs/user/import.txt index a3605710..a5c6610b 100644 --- a/src/docs/user/import.txt +++ b/src/docs/user/import.txt @@ -33,7 +33,7 @@ $ sqoop import (generic-args) (import-args) $ sqoop-import (generic-args) (import-args) ---- -While the Hadoop generic arguments must preceed any import arguments, +While the Hadoop generic arguments must precede any import arguments, you can type the import arguments in any order with respect to one another. @@ -246,6 +246,11 @@ data to a temporary directory and then rename the files into the normal target directory in a manner that does not conflict with existing filenames in that directory. +NOTE: When using the direct mode of import, certain database client utilities +are expected to be present in the shell path of the task process. For MySQL +the utilities +mysqldump+ and +mysqlimport+ are required, whereas for +PostgreSQL the utility +psql+ is required. + Incremental Imports ^^^^^^^^^^^^^^^^^^^