diff --git a/src/docs/user/hive-args.txt b/src/docs/user/hive-args.txt index 7e6b7a01..4d154360 100644 --- a/src/docs/user/hive-args.txt +++ b/src/docs/user/hive-args.txt @@ -32,6 +32,8 @@ Argument Description to Hive. +\--hive-drop-import-delims+ Drops '\n', '\r', and '\01' from string\ fields when importing to Hive. ++\--hive-delims-replacement+ Replace '\n', '\r', and '\01' from string\ + fields with user defined string when importing to Hive. +\--hive-partition-key+ Name of a hive field to partition are \ sharded on +\--hive-partition-value + String-value that serves as partition key\ diff --git a/src/docs/user/hive.txt b/src/docs/user/hive.txt index 2699d3ea..4d2db331 100644 --- a/src/docs/user/hive.txt +++ b/src/docs/user/hive.txt @@ -58,8 +58,11 @@ rows contain string fields that have Hive's default row delimiters (+\n+ and +\r+ characters) or column delimiters (+\01+ characters) present in them. You can use the +\--hive-drop-import-delims+ option to drop those characters on import to give Hive-compatible text data. -This option should only be used if you use Hive's default delimiters -and should not be used if different delimiters are specified. +Alternatively, you can use the +\--hive-delims-replacement+ option +to replace those characters with a user-defined string on import to give +Hive-compatible text data. These options should only be used if you use +Hive's default delimiters and should not be used if different delimiters +are specified. Sqoop will pass the field and record delimiters through to Hive. 
If you do not set any delimiters and do use +\--hive-import+, the field delimiter will diff --git a/src/java/com/cloudera/sqoop/SqoopOptions.java b/src/java/com/cloudera/sqoop/SqoopOptions.java index 52ece1db..d07aecc6 100644 --- a/src/java/com/cloudera/sqoop/SqoopOptions.java +++ b/src/java/com/cloudera/sqoop/SqoopOptions.java @@ -152,6 +152,8 @@ public enum IncrementalMode { private boolean failIfHiveTableExists; @StoredAsProperty("hive.table.name") private String hiveTableName; @StoredAsProperty("hive.drop.delims") private boolean hiveDropDelims; + @StoredAsProperty("hive.delims.replacement") + private String hiveDelimsReplacement; @StoredAsProperty("hive.partition.key") private String hivePartitionKey; @StoredAsProperty("hive.partition.value") private String hivePartitionValue; @@ -1100,6 +1102,18 @@ public void setHiveDropDelims(boolean dropHiveDelims) { this.hiveDropDelims = dropHiveDelims; } + /** + * @return the user-specified option to specify the replacement string + * for hive delimiters + */ + public String getHiveDelimsReplacement() { + return hiveDelimsReplacement; + } + + public void setHiveDelimsReplacement(String replacement) { + this.hiveDelimsReplacement = replacement; + } + /** + * @return the user-specified option to specify sqoop's behavior during + * target table creation if the table exists. diff --git a/src/java/com/cloudera/sqoop/lib/FieldFormatter.java b/src/java/com/cloudera/sqoop/lib/FieldFormatter.java index 41536e1d..dfd35cab 100644 --- a/src/java/com/cloudera/sqoop/lib/FieldFormatter.java +++ b/src/java/com/cloudera/sqoop/lib/FieldFormatter.java @@ -32,8 +32,20 @@ private FieldFormatter() { } * @return */ public static String hiveStringDropDelims(String str, + DelimiterSet delimiters) { + return hiveStringReplaceDelims(str, "", delimiters); + } + + /** + * Replace hive delimiters with a user-defined string passed to the + * --hive-delims-replacement option. 
+ * @param str + * @param delimiters + * @return + */ + public static String hiveStringReplaceDelims(String str, String replacement, DelimiterSet delimiters) { - String droppedDelims = str.replaceAll("\\n|\\r|\01", ""); + String droppedDelims = str.replaceAll("\\n|\\r|\01", replacement); return escapeAndEnclose(droppedDelims, delimiters); } diff --git a/src/java/com/cloudera/sqoop/orm/ClassWriter.java b/src/java/com/cloudera/sqoop/orm/ClassWriter.java index dd3994e5..f4c6b1ef 100644 --- a/src/java/com/cloudera/sqoop/orm/ClassWriter.java +++ b/src/java/com/cloudera/sqoop/orm/ClassWriter.java @@ -826,10 +826,18 @@ private void generateToString(Map columnTypes, } if (javaType.equals("String") && options.doHiveDropDelims()) { - sb.append(" // special case for strings hive, dropping delimiters " + "\\n,\\r,\\01 from strings\n"); - sb.append(" __sb.append(FieldFormatter.hiveStringDropDelims(" + stringExpr + ", delimiters));\n"); + sb.append(" // special case for strings hive, dropping " + "delimiters \\n,\\r,\\01 from strings\n"); + sb.append(" __sb.append(FieldFormatter.hiveStringDropDelims(" + stringExpr + ", delimiters));\n"); + } else if (javaType.equals("String") + && options.getHiveDelimsReplacement() != null) { + sb.append(" // special case for strings hive, replacing " + "delimiters \\n,\\r,\\01 with '" + options.getHiveDelimsReplacement() + "' from strings\n"); + sb.append(" __sb.append(FieldFormatter.hiveStringReplaceDelims(" + stringExpr + ", \"" + options.getHiveDelimsReplacement() + "\", " + "delimiters));\n"); } else { sb.append(" __sb.append(FieldFormatter.escapeAndEnclose(" + stringExpr + ", delimiters));\n"); diff --git a/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java b/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java index c307907f..879c7c8c 100644 --- a/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java +++ b/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java @@ -98,6 +98,8 @@ public abstract class BaseSqoopTool extends 
SqoopTool { public static final String HIVE_TABLE_ARG = "hive-table"; public static final String HIVE_OVERWRITE_ARG = "hive-overwrite"; public static final String HIVE_DROP_DELIMS_ARG = "hive-drop-import-delims"; + public static final String HIVE_DELIMS_REPLACEMENT_ARG = + "hive-delims-replacement"; public static final String HIVE_PARTITION_KEY_ARG = "hive-partition-key"; public static final String HIVE_PARTITION_VALUE_ARG = "hive-partition-value"; public static final String CREATE_HIVE_TABLE_ARG = @@ -426,6 +428,12 @@ protected RelatedOptions getHiveOptions(boolean explicitHiveImport) { + "(\\n\\r) from imported string fields") .withLongOpt(HIVE_DROP_DELIMS_ARG) .create()); + hiveOpts.addOption(OptionBuilder + .hasArg() + .withDescription("Replace Hive record \\0x01 and row delimiters " + + "(\\n\\r) from imported string fields with user-defined string") + .withLongOpt(HIVE_DELIMS_REPLACEMENT_ARG) + .create()); hiveOpts.addOption(OptionBuilder.withArgName("partition-key") .hasArg() .withDescription("Sets the partition key to use when importing to hive") @@ -729,6 +737,11 @@ protected void applyHiveOptions(CommandLine in, SqoopOptions out) out.setHiveDropDelims(true); } + if (in.hasOption(HIVE_DELIMS_REPLACEMENT_ARG)) { + out.setHiveDelimsReplacement( + in.getOptionValue(HIVE_DELIMS_REPLACEMENT_ARG)); + } + if (in.hasOption(HIVE_PARTITION_KEY_ARG)) { out.setHivePartitionKey(in.getOptionValue(HIVE_PARTITION_KEY_ARG)); } @@ -894,6 +907,12 @@ protected void validateHiveOptions(SqoopOptions options) throws InvalidOptionsException { // Empty; this method is present to maintain API consistency, and // is reserved for future constraints on Hive options. + if (options.getHiveDelimsReplacement() != null + && options.doHiveDropDelims()) { + throw new InvalidOptionsException("The " + HIVE_DROP_DELIMS_ARG + + " option conflicts with the " + HIVE_DELIMS_REPLACEMENT_ARG + + " option." 
+ HELP_STR); + } } protected void validateHBaseOptions(SqoopOptions options) diff --git a/src/java/com/cloudera/sqoop/tool/ImportTool.java b/src/java/com/cloudera/sqoop/tool/ImportTool.java index 66e60bda..a9edf8fb 100644 --- a/src/java/com/cloudera/sqoop/tool/ImportTool.java +++ b/src/java/com/cloudera/sqoop/tool/ImportTool.java @@ -839,6 +839,7 @@ public void validateOptions(SqoopOptions options) validateCodeGenOptions(options); validateOutputFormatOptions(options); validateHBaseOptions(options); + validateHiveOptions(options); } } diff --git a/src/test/com/cloudera/sqoop/hive/TestHiveImport.java b/src/test/com/cloudera/sqoop/hive/TestHiveImport.java index 35de2fdc..4d43bc73 100644 --- a/src/test/com/cloudera/sqoop/hive/TestHiveImport.java +++ b/src/test/com/cloudera/sqoop/hive/TestHiveImport.java @@ -32,6 +32,7 @@ import org.junit.Test; import com.cloudera.sqoop.SqoopOptions; +import com.cloudera.sqoop.SqoopOptions.InvalidOptionsException; import com.cloudera.sqoop.testutil.CommonArgs; import com.cloudera.sqoop.testutil.HsqldbTestServer; import com.cloudera.sqoop.testutil.ImportJobTestCase; @@ -40,6 +41,7 @@ import com.cloudera.sqoop.tool.CreateHiveTableTool; import com.cloudera.sqoop.tool.ImportTool; import com.cloudera.sqoop.tool.SqoopTool; +import org.apache.commons.cli.ParseException; /** * Test HiveImport capability after an import to HDFS. @@ -359,6 +361,77 @@ public void testFieldWithHiveDelims() throws IOException, } } + /** + * Test hive import with row that has new line in it. 
+ */ + @Test + public void testFieldWithHiveDelimsReplacement() throws IOException, + InterruptedException { + final String TABLE_NAME = "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT"; + + LOG.info("Doing import of single row into " + + "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT table"); + setCurTableName(TABLE_NAME); + setNumCols(3); + String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" }; + String[] vals = { "'test with\nnew lines\n'", "42", + "'oh no " + '\01' + " field delims " + '\01' + "'", }; + String[] moreArgs = { "--"+BaseSqoopTool.HIVE_DELIMS_REPLACEMENT_ARG, " "}; + + runImportTest(TABLE_NAME, types, vals, + "fieldWithNewlineReplacementImport.q", getArgv(false, moreArgs), + new ImportTool()); + + LOG.info("Validating data in single row is present in: " + + "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT table"); + + // Ideally, we would actually invoke hive code to verify that record with + // record and field delimiters have values replaced and that we have the + // proper number of hive records. Unfortunately, this is a non-trivial task, + // and better dealt with at an integration test level + // + // Instead, this assumes the path of the generated table and just validate + // map job output. + + // Get and read the raw output file + String whDir = getWarehouseDir(); + File p = new File(new File(whDir, TABLE_NAME), "part-m-00000"); + File f = new File(p.toString()); + FileReader fr = new FileReader(f); + BufferedReader br = new BufferedReader(fr); + try { + // verify the output + assertEquals(br.readLine(), "test with new lines " + '\01' + "42" + + '\01' + "oh no field delims "); + assertEquals(br.readLine(), null); // should only be one line + } catch (IOException ioe) { + fail("Unable to read files generated from hive"); + } finally { + br.close(); + } + } + + /** + * Test hive drop and replace option validation. 
+ */ + @Test + public void testHiveDropAndReplaceOptionValidation() throws ParseException { + LOG.info("Testing conflicting Hive delimiter drop/replace options"); + + setNumCols(3); + String[] moreArgs = { "--"+BaseSqoopTool.HIVE_DELIMS_REPLACEMENT_ARG, " ", + "--"+BaseSqoopTool.HIVE_DROP_DELIMS_ARG, }; + + ImportTool tool = new ImportTool(); + try { + tool.validateOptions(tool.parseArguments(getArgv(false, moreArgs), null, + null, true)); + fail("Expected InvalidOptionsException"); + } catch (InvalidOptionsException ex) { + /* success */ + } + } + /** * Test hive import with row that has new line in it. */ diff --git a/testdata/hive/scripts/fieldWithNewlineReplacementImport.q b/testdata/hive/scripts/fieldWithNewlineReplacementImport.q new file mode 100644 index 00000000..9d604b32 --- /dev/null +++ b/testdata/hive/scripts/fieldWithNewlineReplacementImport.q @@ -0,0 +1,2 @@ +CREATE TABLE IF NOT EXISTS `FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT` ( `DATA_COL0` STRING, `DATA_COL1` INT, `DATA_COL2` STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' LINES TERMINATED BY '\012' STORED AS TEXTFILE; +LOAD DATA INPATH 'file:BASEPATH/sqoop/warehouse/FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT' INTO TABLE `FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT`;