From 6cbe7572e9f9894b6256307c537f11cc919e64bf Mon Sep 17 00:00:00 2001
From: Andrew Bayer
Date: Fri, 22 Jul 2011 20:03:38 +0000
Subject: [PATCH] If --hive-import and --generate-only are specified, create a ddl script file.

From: Aaron Kimball

git-svn-id: https://svn.apache.org/repos/asf/incubator/sqoop/trunk@1149870 13f79535-47bb-0310-9956-ffa450edef68
---
 .gitignore                                    |  1 +
 build.xml                                     |  6 +-
 src/docs/hive.txt                             |  6 ++
 src/java/org/apache/hadoop/sqoop/Sqoop.java   | 11 ++-
 .../apache/hadoop/sqoop/hive/HiveImport.java  | 76 +++++++++++++------
 .../hadoop/sqoop/hive/TestHiveImport.java     | 44 ++++++++++-
 6 files changed, 112 insertions(+), 32 deletions(-)

diff --git a/.gitignore b/.gitignore
index 620997a4..51d1309c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,4 +18,5 @@
 .project
 .launches
 .settings
+/tags
diff --git a/build.xml b/build.xml
index 7ed8050d..587e0cd3 100644
--- a/build.xml
+++ b/build.xml
@@ -162,7 +162,7 @@
 
 
-      
+      
 
 
 
@@ -248,7 +248,7 @@
 
       
 
 
@@ -303,7 +303,7 @@
 
 
-      Contrib Tests failed!
+      Unit tests failed!
diff --git a/src/docs/hive.txt b/src/docs/hive.txt
index c9b4a815..cd9f236e 100644
--- a/src/docs/hive.txt
+++ b/src/docs/hive.txt
@@ -54,6 +54,12 @@ The table name used in Hive is, by default, the same as that of the source
 table. You can control the output table name with the +--hive-table+
 option.
 
+If Hive import commands are used in conjunction with the +--generate-only+
+option, then a Hive import will not occur. Instead, the DDL commands to
+perform the import from HDFS to Hive are written to a file named +_tableName_.q+,
+which you can then execute with +hive -f+ after the data is brought into
+HDFS.
+
 Hive's Type System
 ~~~~~~~~~~~~~~~~~~
 
diff --git a/src/java/org/apache/hadoop/sqoop/Sqoop.java b/src/java/org/apache/hadoop/sqoop/Sqoop.java
index 7dd4a388..074215f0 100644
--- a/src/java/org/apache/hadoop/sqoop/Sqoop.java
+++ b/src/java/org/apache/hadoop/sqoop/Sqoop.java
@@ -129,11 +129,14 @@ private void importTable(String tableName) throws IOException, ImportException {
       ImportJobContext context = new ImportJobContext(tableName, jarFile, options);
       manager.importTable(context);
     }
+  }
 
-    // If the user wants this table to be in Hive, perform that post-load.
-    if (options.doHiveImport()) {
-      hiveImport.importTable(tableName, options.getHiveTableName());
-    }
+  // If the user wants this table to be in Hive, perform that post-load.
+  // If the user is in gen-only mode, this code will generate a Hive DDL
+  // statement and write it to a file, but will not actually perform the
+  // import.
+  if (options.doHiveImport()) {
+    hiveImport.importTable(tableName, options.getHiveTableName());
   }
 }
 
diff --git a/src/java/org/apache/hadoop/sqoop/hive/HiveImport.java b/src/java/org/apache/hadoop/sqoop/hive/HiveImport.java
index ed2efcca..0fa6742e 100644
--- a/src/java/org/apache/hadoop/sqoop/hive/HiveImport.java
+++ b/src/java/org/apache/hadoop/sqoop/hive/HiveImport.java
@@ -104,6 +104,31 @@ private void removeTempLogs(String tableName) throws IOException {
     }
   }
 
+  /**
+   * @return true if we're just generating the DDL for the import, but
+   * not actually running it (i.e., --generate-only mode). If so, don't
+   * do any side-effecting actions in Hive.
+   */
+  private boolean isGenerateOnly() {
+    return options.getAction() == SqoopOptions.ControlAction.GenerateOnly;
+  }
+
+  /**
+   * @return a File object that can be used to write the DDL statement.
+   * If we're in gen-only mode, this should be a file in the outdir, named
+   * after the Hive table we're creating.
+   * If we're in import mode, this should be a one-off temporary file.
+   */
+  private File getScriptFile(String outputTableName) throws IOException {
+    if (!isGenerateOnly()) {
+      return File.createTempFile("hive-script-", ".txt",
+          new File(options.getTempDir()));
+    } else {
+      return new File(new File(options.getCodeOutputDir()),
+          outputTableName + ".q");
+    }
+  }
+
   /**
    * Perform the import of data from an HDFS path to a Hive table.
    *
@@ -112,9 +137,11 @@ private void removeTempLogs(String tableName) throws IOException {
    */
   public void importTable(String inputTableName, String outputTableName)
       throws IOException {
-    removeTempLogs(inputTableName);
-
-    LOG.info("Loading uploaded data into Hive");
+    if (!isGenerateOnly()) {
+      removeTempLogs(inputTableName);
+
+      LOG.info("Loading uploaded data into Hive");
+    }
 
     if (null == outputTableName) {
       outputTableName = inputTableName;
@@ -142,12 +169,12 @@ public void importTable(String inputTableName, String outputTableName)
     String loadDataStmtStr = tableWriter.getLoadDataStmt() + ";\n";
 
     // write them to a script file.
-    File tempFile = File.createTempFile("hive-script-",".txt", new File(options.getTempDir()));
+    File scriptFile = getScriptFile(outputTableName);
     try {
-      String tmpFilename = tempFile.toString();
+      String filename = scriptFile.toString();
       BufferedWriter w = null;
       try {
-        FileOutputStream fos = new FileOutputStream(tempFile);
+        FileOutputStream fos = new FileOutputStream(scriptFile);
         w = new BufferedWriter(new OutputStreamWriter(fos));
         w.write(createTableStr, 0, createTableStr.length());
         if (!options.doCreateHiveTableOnly()) {
@@ -167,26 +194,31 @@ public void importTable(String inputTableName, String outputTableName)
         }
       }
 
-      // run Hive on the script and note the return code.
-      String hiveExec = getHiveBinPath();
-      ArrayList<String> args = new ArrayList<String>();
-      args.add(hiveExec);
-      args.add("-f");
-      args.add(tmpFilename);
+      if (!isGenerateOnly()) {
+        // run Hive on the script and note the return code.
+        String hiveExec = getHiveBinPath();
+        ArrayList<String> args = new ArrayList<String>();
+        args.add(hiveExec);
+        args.add("-f");
+        args.add(filename);
 
-      LoggingAsyncSink logSink = new LoggingAsyncSink(LOG);
-      int ret = Executor.exec(args.toArray(new String[0]),
-          env.toArray(new String[0]), logSink, logSink);
-      if (0 != ret) {
-        throw new IOException("Hive exited with status " + ret);
+        LoggingAsyncSink logSink = new LoggingAsyncSink(LOG);
+        int ret = Executor.exec(args.toArray(new String[0]),
+            env.toArray(new String[0]), logSink, logSink);
+        if (0 != ret) {
+          throw new IOException("Hive exited with status " + ret);
+        }
+
+        LOG.info("Hive import complete.");
       }
-
-      LOG.info("Hive import complete.");
     } finally {
-      if (!tempFile.delete()) {
-        LOG.warn("Could not remove temporary file: " + tempFile.toString());
-        // try to delete the file later.
-        tempFile.deleteOnExit();
+      if (!isGenerateOnly()) {
+        // User isn't interested in saving the DDL. Remove the file.
+        if (!scriptFile.delete()) {
+          LOG.warn("Could not remove temporary file: " + scriptFile.toString());
+          // try to delete the file later.
+          scriptFile.deleteOnExit();
+        }
       }
     }
   }
diff --git a/src/test/org/apache/hadoop/sqoop/hive/TestHiveImport.java b/src/test/org/apache/hadoop/sqoop/hive/TestHiveImport.java
index 01628360..5a850b35 100644
--- a/src/test/org/apache/hadoop/sqoop/hive/TestHiveImport.java
+++ b/src/test/org/apache/hadoop/sqoop/hive/TestHiveImport.java
@@ -25,8 +25,9 @@
 import org.apache.commons.logging.LogFactory;
 import org.junit.Test;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-
 import org.apache.hadoop.sqoop.SqoopOptions;
 import org.apache.hadoop.sqoop.testutil.CommonArgs;
 import org.apache.hadoop.sqoop.testutil.HsqldbTestServer;
@@ -57,8 +58,11 @@ public class TestHiveImport extends ImportJobTestCase {
     args.add("--connect");
     args.add(HsqldbTestServer.getUrl());
     args.add("--hive-import");
-    args.add("--split-by");
-    args.add(getColNames()[0]);
+    String [] colNames = getColNames();
+    if (null != colNames) {
+      args.add("--split-by");
+      args.add(colNames[0]);
+    }
     args.add("--num-mappers");
     args.add("1");
 
@@ -101,6 +105,40 @@ private void runImportTest(String tableName, String [] types, String [] values,
     runImport(getArgv(true, extraArgs));
   }
 
+  /** Test that we can generate a file containing the DDL and not import. */
+  @Test
+  public void testGenerateOnly() throws IOException {
+    final String TABLE_NAME = "GenerateOnly";
+    String [] extraArgs = { "--generate-only" };
+
+    // Figure out where our target generated .q file is going to be.
+    SqoopOptions options = getSqoopOptions(extraArgs);
+    Path ddlFile = new Path(new Path(options.getCodeOutputDir()),
+        TABLE_NAME + ".q");
+    FileSystem fs = FileSystem.getLocal(new Configuration());
+
+    // If it's already there, remove it before running the test to ensure
+    // that it's the current test that generated the file.
+    if (fs.exists(ddlFile)) {
+      if (!fs.delete(ddlFile, false)) {
+        LOG.warn("Could not delete previous ddl file: " + ddlFile);
+      }
+    }
+
+    // Run a basic import, but specify that we're just generating definitions.
+    String [] types = { "INTEGER" };
+    String [] vals = { "42" };
+    runImportTest(TABLE_NAME, types, vals, null, extraArgs);
+
+    // Test that the generated definition file exists.
+    assertTrue("Couldn't find expected ddl file", fs.exists(ddlFile));
+
+    Path hiveImportPath = new Path(new Path(options.getWarehouseDir()),
+        TABLE_NAME);
+    assertFalse("Import actually happened!", fs.exists(hiveImportPath));
+  }
+
+
   /** Test that strings and ints are handled in the normal fashion */
   @Test
   public void testNormalHiveImport() throws IOException {
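
Usage sketch: the end-to-end workflow this patch enables might look like the
following session. The table name EMPLOYEES, the connect string, the HDFS
path, and the statements shown inside the generated script are all invented
for illustration; the real contents of _tableName_.q are whatever
tableWriter.getCreateTableStmt() and tableWriter.getLoadDataStmt() emit for
the source table's schema, and the bin/sqoop launcher invocation is likewise
an assumption of this sketch.

  $ sqoop --connect jdbc:hsqldb:hsql://db.example.com/sqoop \
      --table EMPLOYEES --hive-import --generate-only
  $ cat EMPLOYEES.q
  CREATE TABLE EMPLOYEES ( ID INT, NAME STRING) ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\001' STORED AS TEXTFILE;
  LOAD DATA INPATH 'hdfs://localhost:9000/user/me/EMPLOYEES' INTO TABLE EMPLOYEES;

  # Bring the data into HDFS with a regular import, then execute the saved
  # DDL with hive -f, as described in the src/docs/hive.txt change above.
  $ sqoop --connect jdbc:hsqldb:hsql://db.example.com/sqoop --table EMPLOYEES
  $ hive -f EMPLOYEES.q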