diff --git a/src/docs/user/hive-args.txt b/src/docs/user/hive-args.txt
index daa5b0d1..c5af987d 100644
--- a/src/docs/user/hive-args.txt
+++ b/src/docs/user/hive-args.txt
@@ -30,5 +30,9 @@ Argument                      Description
                               to Hive.
 +\--hive-drop-import-delims+  Drops '\n', '\r', and '\01' from string\
                               fields when importing to Hive.
++\--hive-partition-key+       Name of the Hive field to partition the\
+                              table on.
++\--hive-partition-value+     String value to use as the partition value\
+                              for rows imported into Hive in this job.
 --------------------------------------------------------------------------
diff --git a/src/docs/user/hive.txt b/src/docs/user/hive.txt
index 8894434d..83744672 100644
--- a/src/docs/user/hive.txt
+++ b/src/docs/user/hive.txt
@@ -68,3 +68,10 @@ with Hive's defaults.
 The table name used in Hive is, by default, the same as that of the
 source table. You can control the output table name with the
 +\--hive-table+ option.
+
+Hive can put data into partitions for more efficient query
+performance. You can tell a Sqoop job to import data for Hive into a
+particular partition by specifying the +\--hive-partition-key+ and
++\--hive-partition-value+ arguments. The partition value must be a
+string. Please see the Hive documentation for more details on
+partitioning.
diff --git a/src/java/com/cloudera/sqoop/SqoopOptions.java b/src/java/com/cloudera/sqoop/SqoopOptions.java
index f9c79239..8c525baf 100644
--- a/src/java/com/cloudera/sqoop/SqoopOptions.java
+++ b/src/java/com/cloudera/sqoop/SqoopOptions.java
@@ -147,6 +147,8 @@ public enum IncrementalMode {
   @StoredAsProperty("hive.overwrite.table") private boolean overwriteHiveTable;
   @StoredAsProperty("hive.table.name") private String hiveTableName;
   @StoredAsProperty("hive.drop.delims") private boolean hiveDropDelims;
+  @StoredAsProperty("hive.partition.key") private String hivePartitionKey;
+  @StoredAsProperty("hive.partition.value") private String hivePartitionValue;
 
   // An ordered list of column names denoting what order columns are
   // serialized to a PreparedStatement from a generated record type.
@@ -1343,6 +1345,22 @@ public void setHiveTableName(String name) {
     this.hiveTableName = name;
   }
 
+  public String getHivePartitionKey() {
+    return hivePartitionKey;
+  }
+
+  public void setHivePartitionKey(String hpk) {
+    this.hivePartitionKey = hpk;
+  }
+
+  public String getHivePartitionValue() {
+    return hivePartitionValue;
+  }
+
+  public void setHivePartitionValue(String hpv) {
+    this.hivePartitionValue = hpv;
+  }
+
   /**
    * @return the file size to split by when using --direct mode.
    */
diff --git a/src/java/com/cloudera/sqoop/hive/TableDefWriter.java b/src/java/com/cloudera/sqoop/hive/TableDefWriter.java
index 6db2afc5..5f889fbd 100644
--- a/src/java/com/cloudera/sqoop/hive/TableDefWriter.java
+++ b/src/java/com/cloudera/sqoop/hive/TableDefWriter.java
@@ -167,6 +167,12 @@ public String getCreateTableStmt() throws IOException {
       sb.append("COMMENT 'Imported by sqoop on " + curDateStr + "' ");
     }
 
+    if (options.getHivePartitionKey() != null) {
+      sb.append("PARTITIONED BY (")
+        .append(options.getHivePartitionKey())
+        .append(" STRING) ");
+    }
+
     sb.append("ROW FORMAT DELIMITED FIELDS TERMINATED BY '");
     sb.append(getHiveOctalCharCode((int) options.getOutputFieldDelim()));
     sb.append("' LINES TERMINATED BY '");
@@ -208,6 +214,13 @@ public String getLoadDataStmt() throws IOException {
     sb.append(outputTableName);
     sb.append('`');
 
+    if (options.getHivePartitionKey() != null) {
+      sb.append(" PARTITION (")
+        .append(options.getHivePartitionKey())
+        .append("='").append(options.getHivePartitionValue())
+        .append("')");
+    }
+
     LOG.debug("Load statement: " + sb.toString());
     return sb.toString();
   }
diff --git a/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java b/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java
index 42e55520..b7812f51 100644
--- a/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java
+++ b/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java
@@ -90,6 +90,8 @@ public abstract class BaseSqoopTool extends SqoopTool {
   public static final String HIVE_TABLE_ARG = "hive-table";
   public static final String HIVE_OVERWRITE_ARG = "hive-overwrite";
   public static final String HIVE_DROP_DELIMS_ARG = "hive-drop-import-delims";
+  public static final String HIVE_PARTITION_KEY_ARG = "hive-partition-key";
+  public static final String HIVE_PARTITION_VALUE_ARG = "hive-partition-value";
   public static final String NUM_MAPPERS_ARG = "num-mappers";
   public static final String NUM_MAPPERS_SHORT_ARG = "m";
   public static final String COMPRESS_ARG = "compress";
@@ -405,6 +407,17 @@ protected RelatedOptions getHiveOptions(boolean explicitHiveImport) {
         + "(\\n\\r) from imported string fields")
         .withLongOpt(HIVE_DROP_DELIMS_ARG)
         .create());
+    hiveOpts.addOption(OptionBuilder.withArgName("partition-key")
+        .hasArg()
+        .withDescription("Sets the partition key to use when importing to Hive")
+        .withLongOpt(HIVE_PARTITION_KEY_ARG)
+        .create());
+    hiveOpts.addOption(OptionBuilder.withArgName("partition-value")
+        .hasArg()
+        .withDescription("Sets the partition value to use when importing "
+            + "to Hive")
+        .withLongOpt(HIVE_PARTITION_VALUE_ARG)
+        .create());
 
     return hiveOpts;
   }
@@ -662,6 +675,14 @@ protected void applyHiveOptions(CommandLine in, SqoopOptions out)
     if (in.hasOption(HIVE_DROP_DELIMS_ARG)) {
       out.setHiveDropDelims(true);
     }
+
+    if (in.hasOption(HIVE_PARTITION_KEY_ARG)) {
+      out.setHivePartitionKey(in.getOptionValue(HIVE_PARTITION_KEY_ARG));
+    }
+
+    if (in.hasOption(HIVE_PARTITION_VALUE_ARG)) {
+      out.setHivePartitionValue(in.getOptionValue(HIVE_PARTITION_VALUE_ARG));
+    }
   }
 
 protected void applyOutputFormatOptions(CommandLine in, SqoopOptions out)
diff --git a/src/test/com/cloudera/sqoop/TestSqoopOptions.java b/src/test/com/cloudera/sqoop/TestSqoopOptions.java
index 0546094b..376b4524 100644
--- a/src/test/com/cloudera/sqoop/TestSqoopOptions.java
+++ b/src/test/com/cloudera/sqoop/TestSqoopOptions.java
@@ -236,6 +236,16 @@ public void testGoodNumMappers() throws Exception {
     assertEquals(4, opts.getNumMappers());
   }
 
+  public void testHivePartitionParams() throws Exception {
+    String[] args = {
"--hive-partition-key", "ds", + "--hive-partition-value", "20110413", + }; + SqoopOptions opts = parse(args); + assertEquals("ds", opts.getHivePartitionKey()); + assertEquals("20110413", opts.getHivePartitionValue()); + } + public void testPropertySerialization1() { // Test that if we write a SqoopOptions out to a Properties, // and then read it back in, we get all the same results. diff --git a/src/test/com/cloudera/sqoop/hive/TestHiveImport.java b/src/test/com/cloudera/sqoop/hive/TestHiveImport.java index 1ffd24f2..375d4d77 100644 --- a/src/test/com/cloudera/sqoop/hive/TestHiveImport.java +++ b/src/test/com/cloudera/sqoop/hive/TestHiveImport.java @@ -359,4 +359,24 @@ public void testFieldWithHiveDelims() throws IOException, } } + /** + * Test hive import with row that has new line in it. + */ + @Test + public void testImportHiveWithPartitions() throws IOException, + InterruptedException { + final String TABLE_NAME = "PARTITION_HIVE_IMPORT"; + + LOG.info("Doing import of single row into PARTITION_HIVE_IMPORT table"); + setCurTableName(TABLE_NAME); + setNumCols(3); + String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)", }; + String[] vals = { "'whoop'", "42", "'I am a row in a partition'", }; + String[] moreArgs = { "--" + BaseSqoopTool.HIVE_PARTITION_KEY_ARG, "ds", + "--" + BaseSqoopTool.HIVE_PARTITION_VALUE_ARG, "20110413", }; + + runImportTest(TABLE_NAME, types, vals, "partitionImport.q", + getArgv(false, moreArgs), new ImportTool()); + } + } diff --git a/src/test/com/cloudera/sqoop/hive/TestTableDefWriter.java b/src/test/com/cloudera/sqoop/hive/TestTableDefWriter.java index f6614b5a..43b755eb 100644 --- a/src/test/com/cloudera/sqoop/hive/TestTableDefWriter.java +++ b/src/test/com/cloudera/sqoop/hive/TestTableDefWriter.java @@ -21,12 +21,14 @@ import java.util.HashMap; import java.util.Map; +import junit.framework.TestCase; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; -import com.cloudera.sqoop.SqoopOptions; -import junit.framework.TestCase; +import com.cloudera.sqoop.SqoopOptions; +import com.cloudera.sqoop.tool.ImportTool; /** * Test Hive DDL statement generation. 
@@ -78,4 +80,30 @@ public void testDifferentTableNames() throws Exception {
     assertTrue(loadData.indexOf("INTO TABLE `outputTable`") != -1);
     assertTrue(loadData.indexOf("/inputTable'") != -1);
   }
+
+  public void testPartitions() throws Exception {
+    String[] args = {
+        "--hive-partition-key", "ds",
+        "--hive-partition-value", "20110413",
+    };
+    Configuration conf = new Configuration();
+    SqoopOptions options =
+      new ImportTool().parseArguments(args, null, null, false);
+    TableDefWriter writer = new TableDefWriter(options,
+        null, "inputTable", "outputTable", conf, false);
+
+    Map<String, Integer> colTypes = new HashMap<String, Integer>();
+    writer.setColumnTypes(colTypes);
+
+    String createTable = writer.getCreateTableStmt();
+    String loadData = writer.getLoadDataStmt();
+
+    assertNotNull(createTable);
+    assertNotNull(loadData);
+    assertEquals("CREATE TABLE IF NOT EXISTS `outputTable` ( ) "
+        + "PARTITIONED BY (ds STRING) "
+        + "ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\054' "
+        + "LINES TERMINATED BY '\\012' STORED AS TEXTFILE", createTable);
+    assertTrue(loadData.endsWith(" PARTITION (ds='20110413')"));
+  }
 }
diff --git a/testdata/hive/scripts/partitionImport.q b/testdata/hive/scripts/partitionImport.q
new file mode 100644
index 00000000..aa489691
--- /dev/null
+++ b/testdata/hive/scripts/partitionImport.q
@@ -0,0 +1,2 @@
+CREATE TABLE IF NOT EXISTS `PARTITION_HIVE_IMPORT` ( `DATA_COL0` STRING, `DATA_COL1` INT, `DATA_COL2` STRING) PARTITIONED BY (ds STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' LINES TERMINATED BY '\012' STORED AS TEXTFILE;
+LOAD DATA INPATH 'file:BASEPATH/sqoop/warehouse/PARTITION_HIVE_IMPORT' INTO TABLE `PARTITION_HIVE_IMPORT` PARTITION (ds='20110413');
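
For reference, a minimal standalone sketch (not part of the patch itself) of
how the two new flags travel from argument parsing into the generated Hive
statements. It mirrors TestTableDefWriter.testPartitions above and uses only
APIs that appear in this patch; the class name PartitionFlowSketch and the
ds/20110413 key and value are illustrative placeholders taken from the tests,
not required names.

    import java.util.HashMap;

    import org.apache.hadoop.conf.Configuration;

    import com.cloudera.sqoop.SqoopOptions;
    import com.cloudera.sqoop.hive.TableDefWriter;
    import com.cloudera.sqoop.tool.ImportTool;

    public class PartitionFlowSketch {
      public static void main(String[] argv) throws Exception {
        String[] args = {
            "--hive-partition-key", "ds",          // placeholder partition column
            "--hive-partition-value", "20110413",  // placeholder partition value
        };

        // applyHiveOptions() in BaseSqoopTool copies both flags into
        // SqoopOptions (hivePartitionKey / hivePartitionValue).
        SqoopOptions options =
            new ImportTool().parseArguments(args, null, null, false);

        // TableDefWriter consults getHivePartitionKey() and
        // getHivePartitionValue() when emitting the two statements.
        TableDefWriter writer = new TableDefWriter(options, null,
            "inputTable", "outputTable", new Configuration(), false);
        writer.setColumnTypes(new HashMap<String, Integer>());

        // Prints "... PARTITIONED BY (ds STRING) ..." in the CREATE TABLE
        // statement and "... PARTITION (ds='20110413')" in LOAD DATA.
        System.out.println(writer.getCreateTableStmt());
        System.out.println(writer.getLoadDataStmt());
      }
    }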