
SQOOP-100. Sqoop to support populating Hive table partitions.

Initial patch by Frank Maritato.

From: Jonathan Hsieh <jon@cloudera.com>

git-svn-id: https://svn.apache.org/repos/asf/incubator/sqoop/trunk@1150031 13f79535-47bb-0310-9956-ffa450edef68
Andrew Bayer 2011-07-22 20:04:34 +00:00
parent 0a795d32e4
commit 0efc5a4d55
9 changed files with 125 additions and 2 deletions

View File

@@ -30,5 +30,9 @@ Argument Description
to Hive.
+\--hive-drop-import-delims+ Drops '\n', '\r', and '\01' from string\
fields when importing to Hive.
+\--hive-partition-key+ Name of the Hive field that the table is\
partitioned on.
+\--hive-partition-value <v>+ String-value that serves as the partition key\
for data imported into Hive in this job.
--------------------------------------------------------------------------

View File

@@ -68,3 +68,10 @@ with Hive's defaults.
The table name used in Hive is, by default, the same as that of the
source table. You can control the output table name with the +\--hive-table+
option.
Hive can put data into partitions for more efficient query
performance. You can tell a Sqoop job to import data for Hive into a
particular partition by specifying the +\--hive-partition-key+ and
+\--hive-partition-value+ arguments. The partition value must be a
string. Please see the Hive documentation for more details on
partitioning.
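For example, a command line like the following (a sketch; the connect
string and table name are illustrative) would place every imported row
into the +ds=20110413+ partition of the Hive table:

----
$ sqoop import --connect jdbc:mysql://db.example.com/corp \
    --table EMPLOYEES \
    --hive-import \
    --hive-partition-key ds \
    --hive-partition-value 20110413
----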

View File

@@ -147,6 +147,8 @@ public enum IncrementalMode {
@StoredAsProperty("hive.overwrite.table") private boolean overwriteHiveTable;
@StoredAsProperty("hive.table.name") private String hiveTableName;
@StoredAsProperty("hive.drop.delims") private boolean hiveDropDelims;
@StoredAsProperty("hive.partition.key") private String hivePartitionKey;
@StoredAsProperty("hive.partition.value") private String hivePartitionValue;
// An ordered list of column names denoting what order columns are
// serialized to a PreparedStatement from a generated record type.
@@ -1343,6 +1345,22 @@ public void setHiveTableName(String name) {
this.hiveTableName = name;
}
public String getHivePartitionKey() {
return hivePartitionKey;
}
public void setHivePartitionKey(String hpk) {
this.hivePartitionKey = hpk;
}
public String getHivePartitionValue() {
return hivePartitionValue;
}
public void setHivePartitionValue(String hpv) {
this.hivePartitionValue = hpv;
}
/**
* @return the file size to split by when using --direct mode.
*/

View File

@@ -167,6 +167,12 @@ public String getCreateTableStmt() throws IOException {
sb.append("COMMENT 'Imported by sqoop on " + curDateStr + "' ");
}
if (options.getHivePartitionKey() != null) {
sb.append("PARTITIONED BY (")
.append(options.getHivePartitionKey())
.append(" STRING) ");
}
sb.append("ROW FORMAT DELIMITED FIELDS TERMINATED BY '");
sb.append(getHiveOctalCharCode((int) options.getOutputFieldDelim()));
sb.append("' LINES TERMINATED BY '");
@@ -208,6 +214,13 @@ public String getLoadDataStmt() throws IOException {
sb.append(outputTableName);
sb.append('`');
if (options.getHivePartitionKey() != null) {
sb.append(" PARTITION (")
.append(options.getHivePartitionKey())
.append("='").append(options.getHivePartitionValue())
.append("')");
}
LOG.debug("Load statement: " + sb.toString());
return sb.toString();
}

View File

@@ -90,6 +90,8 @@ public abstract class BaseSqoopTool extends SqoopTool {
public static final String HIVE_TABLE_ARG = "hive-table";
public static final String HIVE_OVERWRITE_ARG = "hive-overwrite";
public static final String HIVE_DROP_DELIMS_ARG = "hive-drop-import-delims";
public static final String HIVE_PARTITION_KEY_ARG = "hive-partition-key";
public static final String HIVE_PARTITION_VALUE_ARG = "hive-partition-value";
public static final String NUM_MAPPERS_ARG = "num-mappers";
public static final String NUM_MAPPERS_SHORT_ARG = "m";
public static final String COMPRESS_ARG = "compress";
@@ -405,6 +407,17 @@ protected RelatedOptions getHiveOptions(boolean explicitHiveImport) {
+ "(\\n\\r) from imported string fields")
.withLongOpt(HIVE_DROP_DELIMS_ARG)
.create());
hiveOpts.addOption(OptionBuilder.withArgName("partition-key")
.hasArg()
.withDescription("Sets the partition key to use when importing to hive")
.withLongOpt(HIVE_PARTITION_KEY_ARG)
.create());
hiveOpts.addOption(OptionBuilder.withArgName("partition-value")
.hasArg()
.withDescription("Sets the partition value to use when importing "
+ "to hive")
.withLongOpt(HIVE_PARTITION_VALUE_ARG)
.create());
return hiveOpts;
}
@@ -662,6 +675,14 @@ protected void applyHiveOptions(CommandLine in, SqoopOptions out)
if (in.hasOption(HIVE_DROP_DELIMS_ARG)) {
out.setHiveDropDelims(true);
}
if (in.hasOption(HIVE_PARTITION_KEY_ARG)) {
out.setHivePartitionKey(in.getOptionValue(HIVE_PARTITION_KEY_ARG));
}
if (in.hasOption(HIVE_PARTITION_VALUE_ARG)) {
out.setHivePartitionValue(in.getOptionValue(HIVE_PARTITION_VALUE_ARG));
}
}
protected void applyOutputFormatOptions(CommandLine in, SqoopOptions out)

View File

@@ -236,6 +236,16 @@ public void testGoodNumMappers() throws Exception {
assertEquals(4, opts.getNumMappers());
}
public void testHivePartitionParams() throws Exception {
String[] args = {
"--hive-partition-key", "ds",
"--hive-partition-value", "20110413",
};
SqoopOptions opts = parse(args);
assertEquals("ds", opts.getHivePartitionKey());
assertEquals("20110413", opts.getHivePartitionValue());
}
public void testPropertySerialization1() {
// Test that if we write a SqoopOptions out to a Properties,
// and then read it back in, we get all the same results.

View File

@@ -359,4 +359,24 @@ public void testFieldWithHiveDelims() throws IOException,
}
}
/**
 * Test hive import into a partitioned Hive table.
 */
@Test
public void testImportHiveWithPartitions() throws IOException,
InterruptedException {
final String TABLE_NAME = "PARTITION_HIVE_IMPORT";
LOG.info("Doing import of single row into PARTITION_HIVE_IMPORT table");
setCurTableName(TABLE_NAME);
setNumCols(3);
String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)", };
String[] vals = { "'whoop'", "42", "'I am a row in a partition'", };
String[] moreArgs = { "--" + BaseSqoopTool.HIVE_PARTITION_KEY_ARG, "ds",
"--" + BaseSqoopTool.HIVE_PARTITION_VALUE_ARG, "20110413", };
runImportTest(TABLE_NAME, types, vals, "partitionImport.q",
getArgv(false, moreArgs), new ImportTool());
}
}

View File

@@ -21,12 +21,14 @@
import java.util.HashMap;
import java.util.Map;
import junit.framework.TestCase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import com.cloudera.sqoop.SqoopOptions;
import com.cloudera.sqoop.tool.ImportTool;
/**
* Test Hive DDL statement generation.
@@ -78,4 +80,30 @@ public void testDifferentTableNames() throws Exception {
assertTrue(loadData.indexOf("INTO TABLE `outputTable`") != -1);
assertTrue(loadData.indexOf("/inputTable'") != -1);
}
public void testPartitions() throws Exception {
String[] args = {
"--hive-partition-key", "ds",
"--hive-partition-value", "20110413",
};
Configuration conf = new Configuration();
SqoopOptions options =
new ImportTool().parseArguments(args, null, null, false);
TableDefWriter writer = new TableDefWriter(options,
null, "inputTable", "outputTable", conf, false);
Map<String, Integer> colTypes = new HashMap<String, Integer>();
writer.setColumnTypes(colTypes);
String createTable = writer.getCreateTableStmt();
String loadData = writer.getLoadDataStmt();
assertNotNull(createTable);
assertNotNull(loadData);
assertEquals("CREATE TABLE IF NOT EXISTS `outputTable` ( ) "
+ "PARTITIONED BY (ds STRING) "
+ "ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\054' "
+ "LINES TERMINATED BY '\\012' STORED AS TEXTFILE", createTable);
assertTrue(loadData.endsWith(" PARTITION (ds='20110413')"));
}
}

View File

@@ -0,0 +1,2 @@
CREATE TABLE IF NOT EXISTS `PARTITION_HIVE_IMPORT` ( `DATA_COL0` STRING, `DATA_COL1` INT, `DATA_COL2` STRING) PARTITIONED BY (ds STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' LINES TERMINATED BY '\012' STORED AS TEXTFILE;
LOAD DATA INPATH 'file:BASEPATH/sqoop/warehouse/PARTITION_HIVE_IMPORT' INTO TABLE `PARTITION_HIVE_IMPORT` PARTITION (ds='20110413');