From 47535ee3dee24170db3303b71855456a8d142711 Mon Sep 17 00:00:00 2001
From: Fero Szabo
Date: Tue, 11 Dec 2018 14:31:34 +0100
Subject: [PATCH] various fixes and reformats

---
 src/docs/user/hive.txt   | 47 +++++++++++++++++++----------------
 src/docs/user/import.txt | 53 ++++++++++++++++++++--------------------
 2 files changed, 52 insertions(+), 48 deletions(-)

diff --git a/src/docs/user/hive.txt b/src/docs/user/hive.txt
index 498602c2..979e7afc 100644
--- a/src/docs/user/hive.txt
+++ b/src/docs/user/hive.txt
@@ -116,10 +116,10 @@ External table import
 +++++++++++++++++++++

 You can specify the +\--external-table-dir+ option in the sqoop command to
-work with an external Hive table (instead of a managed table, i.e. the default behavior).
-To import data into an external table, one has to specify +\--hive-import+ in the command
-line arguments. Table creation is also supported with the use of +\--create-hive-table+
-option.
+work with an external Hive table (instead of a managed table, i.e. the default
+behavior). To import data into an external table, one has to specify the
++\--hive-import+ option in the command line arguments. Table creation is
+also supported with the use of the +\--create-hive-table+ option.

 Importing into an external Hive table:
 ----
@@ -131,28 +131,33 @@ Create an external Hive table:
 $ sqoop import --hive-import --create-hive-table --connect $CONN --table $TABLENAME --username $USER --password $PASS --external-table-dir /tmp/foobar_example --hive-table foobar
 ----

-Decimals in Hive imports using parquet files
-++++++++++++++++++++++++++++++++++++++++++++
+Decimals in Hive import using parquet file
+++++++++++++++++++++++++++++++++++++++++++

 As mentioned above, a Hive import is a two-step process in Sqoop:
-first, the data is imported onto HDFS, then a statement is generated and executed to create a Hive table.
+first, the data is imported onto HDFS, then an HQL statement is generated and
+executed to create the Hive table.

-Since Sqoop is using an avro schema to write parquet files, first an Avro schema is generated from the SQL types.
-This schema is then used in a regular Parquet import. After the data was imported onto HDFS successfully,
-Sqoop uses the Avro schema to create a Hive command to create a table in Hive and maps the Avro types to Hive
-types in this process.
+During the first step, an Avro schema is generated from the SQL data types.
+This schema is then used in a regular Parquet import. After the data was
+imported onto HDFS successfully, Sqoop takes the Avro schema, maps the Avro
+types to Hive types and generates the HQL statement to create the table.

-Decimal SQL types are converted to Strings in a parquet import per default, so Decimal columns appear as String
-columns in Hive per default. You can change this behavior and use logical types instead, so that Decimals
-will be properly mapped to the Hive type Decimal as well. This has to be enabled with the
-+sqoop.parquet.logical_types.decimal.enable+ property. As noted in the section discussing
-'Padding number types in avro and parquet import', you should also specify the default precision and scale and
-enable padding.
+Decimal SQL types are converted to Strings in a parquet import by default,
+so Decimal columns appear as String columns in Hive by default. You can change
+this behavior by enabling logical types for parquet, so that Decimals will be
+properly mapped to the Hive type Decimal as well. This can be done with the
++sqoop.parquet.logical_types.decimal.enable+ property. As noted in the section
+discussing 'Enabling Logical Types in Avro and Parquet import for numbers',
+you should also specify the default precision and scale and enable padding.

-A limitation of Hive is that the maximum precision and scale is 38. When converting SQL types to the Hive Decimal
-type, precision and scale will be modified to meet this limitation, automatically. The data itself however, will
-only have to adhere to the limitations of the Parquet file format, thus values with a precision and scale bigger than
-38 will be present on storage, but they won't be readable by Hive, (since Hive is a schema-on-read tool).
+A limitation of Hive is that the maximum precision and scale is 38. When
+converting to the Hive Decimal type, precision and scale will be reduced
+automatically to meet this limitation, if necessary. The data itself, however,
+will only have to adhere to the limitations of the Avro schema, thus values
+with a precision and scale bigger than 38 are allowed and will be present on
+storage, but they won't be readable by Hive (since Hive is a
+schema-on-read tool).

 Enabling padding and specifying a default precision and scale in a Hive Import:
 ----
diff --git a/src/docs/user/import.txt b/src/docs/user/import.txt
index d878e216..701f6704 100644
--- a/src/docs/user/import.txt
+++ b/src/docs/user/import.txt
@@ -472,46 +472,48 @@ Enabling Logical Types in Avro and Parquet import for numbers
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 To enable the use of logical types in Sqoop's avro schema generation,
-i.e. used during both avro and parquet imports, one has to use the
-sqoop.avro.logical_types.decimal.enable flag. This is necessary if one
+i.e. used both during avro and parquet imports, one has to use the
++sqoop.avro.logical_types.decimal.enable+ property. This is necessary if one
 wants to store values as decimals in the avro file format.

+In the case of a parquet import, one has to use the
++sqoop.parquet.logical_types.decimal.enable+ property.
+
 Padding number types in avro and parquet import
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++++++++++++++++++++++++++++++++++++++++++++++++

 Certain databases, such as Oracle and Postgres store number and decimal
 values without padding. For example 1.5 in a column declared
 as NUMBER (20, 5) is stored as is in Oracle, while the equivalent
 DECIMAL (20, 5) is stored as 1.50000 in an SQL server instance.

-This leads to a scale mismatch during avro import.
+This leads to a scale mismatch during the import.

-To avoid this error, one can use the sqoop.avro.decimal_padding.enable flag
-to turn on padding with 0s during. One also has to enable logical types with the
-sqoop.avro.logical_types.decimal.enable property set to true during an avro import,
-or with the sqoop.parquet.logical_types.decimal.enable property during a parquet import.
+To avoid this error, one can use the +sqoop.avro.decimal_padding.enable+
+property to turn on padding with 0s during the import. This property is used
+together with enabling logical types, in either an avro or a parquet import.

 Default precision and scale in avro and parquet import
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 All of the databases allow users to specify numeric columns
 without a precision or scale. While MS SQL and MySQL translate these into
-a valid precision and scale values, Oracle and Postgres don't.
+valid precision and scale, Oracle and Postgres don't.

-When a table contains NUMBER in a table in Oracle or
-NUMERIC/DECIMAL in Postgres, one can specify a default precision and scale
-to be used in the avro schema by using the +sqoop.avro.logical_types.decimal.default.precision+
+When a table contains a NUMBER column in Oracle or NUMERIC/DECIMAL in
+Postgres, one can specify a default precision and scale to be used in the
+avro schema by using the +sqoop.avro.logical_types.decimal.default.precision+
 and +sqoop.avro.logical_types.decimal.default.scale+ properties.
 Avro padding also has to be enabled, if the values are shorter than
-the specified default scale.
+the specified default scale, and logical types have to be enabled as well.

-Even though their name contains 'avro', the very same properties
-(+sqoop.avro.logical_types.decimal.default.precision+ and +sqoop.avro.logical_types.decimal.default.scale+)
+Even though the names of the properties contain 'avro', the very same properties
+(+sqoop.avro.logical_types.decimal.default.precision+ and
++sqoop.avro.logical_types.decimal.default.scale+)
 can be used to specify defaults during a parquet import as well.
-But please not that the padding has to be enabled with the parquet specific property.
-The implementation of the padding logic is database independent.
-Our tests only cover only Oracle, Postgres, MS Sql server and MySQL databases,
-therefore these are the supported ones.
+The implementation of this logic and the padding is database independent.
+However, our tests cover Oracle, Postgres, MS Sql server and MySQL databases
+only, therefore these are the supported ones.

 Large Objects
 ^^^^^^^^^^^^^
@@ -848,30 +850,27 @@ $ sqoop import --connect jdbc:mysql://db.foo.com/corp \
 ----

 Enabling logical types in avro import and also turning on padding with 0s:
-
 ----
 $ sqoop import -Dsqoop.avro.decimal_padding.enable=true -Dsqoop.avro.logical_types.decimal.enable=true
-    --connect $CON --username $USER --password $PASS --query "select * from table_name where \$CONDITIONS"
+    --connect $MYCONN --username $MYUSER --password $MYPASS --query "select * from table_name where \$CONDITIONS"
     --target-dir hdfs://nameservice1//etl/target_path --as-avrodatafile --verbose -m 1
 ----

 Enabling logical types in avro import and also turning on padding with 0s, while specifying default precision and scale as well:
-
 ----
 $ sqoop import -Dsqoop.avro.decimal_padding.enable=true -Dsqoop.avro.logical_types.decimal.enable=true
     -Dsqoop.avro.logical_types.decimal.default.precision=38 -Dsqoop.avro.logical_types.decimal.default.scale=10
-    --connect $CON --username $USER --password $PASS --query "select * from table_name where \$CONDITIONS"
+    --connect $MYCONN --username $MYUSER --password $MYPASS --query "select * from table_name where \$CONDITIONS"
     --target-dir hdfs://nameservice1//etl/target_path --as-avrodatafile --verbose -m 1
 ----

-The same in a parquet import:
-
+Enabling logical types in parquet import and also turning on padding with 0s, while specifying default precision and scale as well:
 ----
 $ sqoop import -Dsqoop.parquet.decimal_padding.enable=true -Dsqoop.avro.logical_types.decimal.enable=true
     -Dsqoop.avro.logical_types.decimal.default.precision=38 -Dsqoop.avro.logical_types.decimal.default.scale=10
-    --connect $CON --username $USER --password $PASS --query "select * from table_name where \$CONDITIONS"
+    --connect $MYCONN --username $MYUSER --password $MYPASS --query "select * from table_name where \$CONDITIONS"
     --target-dir hdfs://nameservice1//etl/target_path --as-parquetfile --verbose -m 1
 ----
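
For illustration, the properties documented in the changed sections can be combined in a single Hive parquet import. The sketch below is not taken from the patched files; it only follows the prose above, and $CONN, $USER, $PASS, $TABLENAME and the Hive table name foobar are placeholders in the style of the existing examples.

----
# Illustrative sketch only: connection values and table names are placeholders,
# and the property names are the ones described in the sections above.
$ sqoop import -Dsqoop.avro.decimal_padding.enable=true -Dsqoop.parquet.logical_types.decimal.enable=true
    -Dsqoop.avro.logical_types.decimal.default.precision=38 -Dsqoop.avro.logical_types.decimal.default.scale=10
    --connect $CONN --table $TABLENAME --username $USER --password $PASS
    --hive-import --hive-table foobar --as-parquetfile -m 1
----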