mirror of
https://github.com/apache/sqoop.git
synced 2025-05-03 14:11:49 +08:00
Changes for Postgresql Copy Export:
1. Support for empty null string --null-string '' 2. Support for non-xml delim --fields-terminated-by'\0x1c' 3. Added line buffering perf --batch 4. optional TEXT mode instead of CSV -Dpostgresql.format.text=true 5. Support for postgres Version 8 -Dpostgresql.targetdb.ver=8 6. Optional disable escape sequences -Dpostgresql.input.israw=true
This commit is contained in:
parent
032b828370
commit
123641f3a9
@ -79,16 +79,46 @@ protected void configureMapper(Job job, String tableName,
|
|||||||
job.setMapOutputKeyClass(NullWritable.class);
|
job.setMapOutputKeyClass(NullWritable.class);
|
||||||
job.setMapOutputValueClass(NullWritable.class);
|
job.setMapOutputValueClass(NullWritable.class);
|
||||||
}
|
}
|
||||||
|
/* list of chars that cannot be passed via jobconf */
|
||||||
|
final static String badXmlString = "\u0000\u0001\u0002\u0003\u0004\u0005" +
|
||||||
|
"\u0006\u0007\u0008\u000B\u000C\u000E\u000F\u0010\u0011\u0012" +
|
||||||
|
"\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C" +
|
||||||
|
"\u001D\u001E\u001F\uFFFE\uFFFF";
|
||||||
|
|
||||||
|
/* true if the char is ok to pass via Configuration */
|
||||||
|
public static boolean validXml(char s){
|
||||||
|
return (badXmlString.indexOf(s)<0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
protected void propagateOptionsToJob(Job job) {
|
protected void propagateOptionsToJob(Job job) {
|
||||||
super.propagateOptionsToJob(job);
|
super.propagateOptionsToJob(job);
|
||||||
SqoopOptions opts = context.getOptions();
|
SqoopOptions opts = context.getOptions();
|
||||||
Configuration conf = job.getConfiguration();
|
Configuration conf = job.getConfiguration();
|
||||||
if (opts.getNullStringValue() != null) {
|
|
||||||
conf.set("postgresql.null.string", opts.getNullStringValue());
|
/* empty string needs to be passed as a flag */
|
||||||
|
if (opts.getNullStringValue().equals("")) {
|
||||||
|
conf.set("postgresql.null.emptystring","true");
|
||||||
}
|
}
|
||||||
setDelimiter("postgresql.input.field.delim",
|
|
||||||
opts.getInputFieldDelim(), conf);
|
/* valid delimiters may not be valid xml chars, so the hadoop conf will fail.
|
||||||
|
* but we still want to support them so we base64 encode it in that case
|
||||||
|
* */
|
||||||
|
char delim= opts.getInputFieldDelim();
|
||||||
|
String delimString=Character.toString(delim);
|
||||||
|
if(validXml(delim)){
|
||||||
|
setDelimiter("postgresql.input.field.delim",delim,conf);
|
||||||
|
}else{
|
||||||
|
conf.set("postgresql.input.field.delim.base64",
|
||||||
|
java.util.Base64.getEncoder().encodeToString(delimString.getBytes()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* use the --batch switch to enable line buffering */
|
||||||
|
if (opts.isBatchMode()){
|
||||||
|
conf.set("postgresql.export.batchmode","true");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* todo: there may still be some case where user wants an invalid xml char for record delim */
|
||||||
setDelimiter("postgresql.input.record.delim",
|
setDelimiter("postgresql.input.record.delim",
|
||||||
opts.getInputRecordDelim(), conf);
|
opts.getInputRecordDelim(), conf);
|
||||||
setDelimiter("postgresql.input.enclosedby",
|
setDelimiter("postgresql.input.enclosedby",
|
||||||
|
@ -36,6 +36,7 @@
|
|||||||
import org.postgresql.PGConnection;
|
import org.postgresql.PGConnection;
|
||||||
import org.postgresql.copy.CopyManager;
|
import org.postgresql.copy.CopyManager;
|
||||||
import org.postgresql.copy.CopyIn;
|
import org.postgresql.copy.CopyIn;
|
||||||
|
import org.apache.sqoop.util.LineBuffer;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -52,6 +53,10 @@ public class PostgreSQLCopyExportMapper
|
|||||||
public static final Log LOG =
|
public static final Log LOG =
|
||||||
LogFactory.getLog(PostgreSQLCopyExportMapper.class.getName());
|
LogFactory.getLog(PostgreSQLCopyExportMapper.class.getName());
|
||||||
|
|
||||||
|
private boolean bufferMode=false; /* whether or not to use the line buffer */
|
||||||
|
private LineBuffer lineBuffer; /* batch up the lines before sending to copy */
|
||||||
|
private boolean isRaw=false; /* if isRaw then we won't interprete escapes */
|
||||||
|
|
||||||
private Configuration conf;
|
private Configuration conf;
|
||||||
private DBConfiguration dbConf;
|
private DBConfiguration dbConf;
|
||||||
private Connection conn = null;
|
private Connection conn = null;
|
||||||
@ -63,6 +68,17 @@ public class PostgreSQLCopyExportMapper
|
|||||||
|
|
||||||
public PostgreSQLCopyExportMapper() {
|
public PostgreSQLCopyExportMapper() {
|
||||||
}
|
}
|
||||||
|
/* Text mode normally interprets escape sequences. Optionally
|
||||||
|
* turn that off by escaping the escapes
|
||||||
|
* */
|
||||||
|
public String fixEscapes(String s){
|
||||||
|
if (isRaw){
|
||||||
|
return s.replace("\\","\\\\");
|
||||||
|
} else {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void setup(Context context)
|
protected void setup(Context context)
|
||||||
@ -71,6 +87,7 @@ protected void setup(Context context)
|
|||||||
super.setup(context);
|
super.setup(context);
|
||||||
conf = context.getConfiguration();
|
conf = context.getConfiguration();
|
||||||
dbConf = new DBConfiguration(conf);
|
dbConf = new DBConfiguration(conf);
|
||||||
|
lineBuffer=new LineBuffer();
|
||||||
CopyManager cm = null;
|
CopyManager cm = null;
|
||||||
try {
|
try {
|
||||||
conn = dbConf.getConnection();
|
conn = dbConf.getConnection();
|
||||||
@ -83,69 +100,141 @@ protected void setup(Context context)
|
|||||||
throw new IOException(ex);
|
throw new IOException(ex);
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
StringBuilder sql = new StringBuilder();
|
/* Set if buffering mode is requested */
|
||||||
sql.append("COPY ");
|
this.bufferMode=("true".equals(conf.get("postgresql.export.batchmode")));
|
||||||
sql.append(dbConf.getOutputTableName());
|
|
||||||
sql.append(" FROM STDIN WITH (");
|
/* isRaw means escapes are NOT to be interpreted */
|
||||||
sql.append(" ENCODING 'UTF-8' ");
|
this.isRaw=("true".equals(conf.get("postgresql.input.israw")));
|
||||||
sql.append(", FORMAT csv ");
|
|
||||||
sql.append(", DELIMITER ");
|
/* add support for delims which are not valid xml. We have base64 encoded them */
|
||||||
sql.append("'");
|
String delimBase64=conf.get("postgresql.input.field.delim.base64");
|
||||||
sql.append(conf.get("postgresql.input.field.delim", ","));
|
String delim=null;
|
||||||
sql.append("'");
|
if (delimBase64!=null){
|
||||||
sql.append(", QUOTE ");
|
delim=new String(java.util.Base64.getDecoder().decode(delimBase64));
|
||||||
sql.append("'");
|
} else {
|
||||||
sql.append(conf.get("postgresql.input.enclosedby", "\""));
|
delim=conf.get("postgresql.input.field.delim",",");
|
||||||
sql.append("'");
|
}
|
||||||
sql.append(", ESCAPE ");
|
|
||||||
sql.append("'");
|
/* Some postgres instances out there still using version 8.x */
|
||||||
sql.append(conf.get("postgresql.input.escapedby", "\""));
|
StringBuilder sql = new StringBuilder();
|
||||||
sql.append("'");
|
String ver=conf.get("postgresql.targetdb.ver", "9");
|
||||||
if (conf.get("postgresql.null.string") != null) {
|
if (ver.equals("8")){
|
||||||
sql.append(", NULL ");
|
sql.append("COPY ");
|
||||||
sql.append("'");
|
sql.append(dbConf.getOutputTableName());
|
||||||
sql.append(conf.get("postgresql.null.string"));
|
sql.append(" FROM STDIN WITH ");
|
||||||
sql.append("'");
|
sql.append(" DELIMITER ");
|
||||||
}
|
sql.append("'");
|
||||||
sql.append(")");
|
sql.append(delim);
|
||||||
|
sql.append("'");
|
||||||
|
if (! "true".equals(conf.get("postgresql.format.text"))){
|
||||||
|
sql.append(" CSV ");
|
||||||
|
sql.append(" QUOTE ");
|
||||||
|
sql.append("'");
|
||||||
|
sql.append(conf.get("postgresql.input.enclosedby", "\""));
|
||||||
|
sql.append("'");
|
||||||
|
sql.append(" ESCAPE ");
|
||||||
|
sql.append("'");
|
||||||
|
sql.append(conf.get("postgresql.input.escapedby", "\""));
|
||||||
|
sql.append("'");
|
||||||
|
}
|
||||||
|
/* Hadoop config does not permit empty string so we use special switch to designate that */
|
||||||
|
if (conf.get("postgresql.null.emptystring")!=null){
|
||||||
|
sql.append(" NULL ''");
|
||||||
|
}else
|
||||||
|
if (conf.get("postgresql.null.string") != null) {
|
||||||
|
sql.append(" NULL ");
|
||||||
|
sql.append("'");
|
||||||
|
sql.append(conf.get("postgresql.null.string"));
|
||||||
|
sql.append("'");
|
||||||
|
}
|
||||||
|
} else { /* intended for version 9.x This has not been fixed for buffering */
|
||||||
|
sql.append("COPY ");
|
||||||
|
sql.append(dbConf.getOutputTableName());
|
||||||
|
sql.append(" FROM STDIN WITH (");
|
||||||
|
sql.append(" ENCODING 'UTF-8' ");
|
||||||
|
sql.append(", FORMAT csv ");
|
||||||
|
sql.append(", DELIMITER ");
|
||||||
|
sql.append("'");
|
||||||
|
sql.append(conf.get("postgresql.input.field.delim", ","));
|
||||||
|
sql.append("'");
|
||||||
|
sql.append(", QUOTE ");
|
||||||
|
sql.append("'");
|
||||||
|
sql.append(conf.get("postgresql.input.enclosedby", "\""));
|
||||||
|
sql.append("'");
|
||||||
|
sql.append(", ESCAPE ");
|
||||||
|
sql.append("'");
|
||||||
|
sql.append(conf.get("postgresql.input.escapedby", "\""));
|
||||||
|
sql.append("'");
|
||||||
|
if (conf.get("postgresql.null.string") != null) {
|
||||||
|
sql.append(", NULL ");
|
||||||
|
sql.append("'");
|
||||||
|
sql.append(conf.get("postgresql.null.string"));
|
||||||
|
sql.append("'");
|
||||||
|
}
|
||||||
|
sql.append(")");
|
||||||
|
}
|
||||||
LOG.debug("Starting export with copy: " + sql);
|
LOG.debug("Starting export with copy: " + sql);
|
||||||
copyin = cm.copyIn(sql.toString());
|
copyin = cm.copyIn(sql.toString());
|
||||||
} catch (SQLException ex) {
|
} catch (SQLException ex) {
|
||||||
LoggingUtils.logAll(LOG, "Unable to get CopyIn", ex);
|
LoggingUtils.logAll(LOG, "Unable to get CopyIn", ex);
|
||||||
close();
|
close();
|
||||||
throw new IOException(ex);
|
throw new IOException(ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void map(LongWritable key, Writable value, Context context)
|
public void map(LongWritable key, Writable value, Context context)
|
||||||
throws IOException, InterruptedException {
|
throws IOException, InterruptedException {
|
||||||
line.setLength(0);
|
if (bufferMode){
|
||||||
line.append(value.toString());
|
if (lineBuffer.append(fixEscapes(value.toString()))){
|
||||||
if (value instanceof Text) {
|
return;
|
||||||
line.append(System.getProperty("line.separator"));
|
}
|
||||||
}
|
/* else buffer is full lets write out */
|
||||||
try {
|
try {
|
||||||
byte[]data = line.toString().getBytes("UTF-8");
|
byte[]data=lineBuffer.getBytes();
|
||||||
copyin.writeToCopy(data, 0, data.length);
|
copyin.writeToCopy(data,0,data.length);
|
||||||
} catch (SQLException ex) {
|
lineBuffer.clear();
|
||||||
LoggingUtils.logAll(LOG, "Unable to execute copy", ex);
|
} catch (SQLException ex) {
|
||||||
close();
|
LoggingUtils.logAll(LOG, "Unable to execute copy", ex);
|
||||||
throw new IOException(ex);
|
close();
|
||||||
}
|
throw new IOException(ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* now write the new line that could not be appended because the buffer was full */
|
||||||
|
lineBuffer.append(fixEscapes(value.toString()));
|
||||||
|
} else { /* original unbuffered method */
|
||||||
|
line.setLength(0);
|
||||||
|
line.append(value.toString());
|
||||||
|
if (value instanceof Text) {
|
||||||
|
line.append(System.getProperty("line.separator"));
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
byte[]data = line.toString().getBytes("UTF-8");
|
||||||
|
copyin.writeToCopy(data, 0, data.length);
|
||||||
|
} catch (SQLException ex) {
|
||||||
|
LoggingUtils.logAll(LOG, "Unable to execute copy", ex);
|
||||||
|
close();
|
||||||
|
throw new IOException(ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void cleanup(Context context)
|
protected void cleanup(Context context)
|
||||||
throws IOException, InterruptedException {
|
throws IOException, InterruptedException {
|
||||||
try {
|
try { /* write out the final fragment in the buffer */
|
||||||
copyin.endCopy();
|
if (bufferMode){
|
||||||
} catch (SQLException ex) {
|
byte[]data=lineBuffer.getBytes();
|
||||||
LoggingUtils.logAll(LOG, "Unable to finalize copy", ex);
|
copyin.writeToCopy(data,0,data.length);
|
||||||
throw new IOException(ex);
|
lineBuffer.clear();
|
||||||
|
}
|
||||||
|
copyin.endCopy();
|
||||||
|
} catch (SQLException ex) {
|
||||||
|
LoggingUtils.logAll(LOG, "Unable to finalize copy", ex);
|
||||||
|
throw new IOException(ex);
|
||||||
|
}
|
||||||
|
close();
|
||||||
}
|
}
|
||||||
close();
|
|
||||||
}
|
|
||||||
|
|
||||||
void close() throws IOException {
|
void close() throws IOException {
|
||||||
if (conn != null) {
|
if (conn != null) {
|
||||||
|
33
src/java/org/apache/sqoop/util/LineBuffer.java
Normal file
33
src/java/org/apache/sqoop/util/LineBuffer.java
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
package org.apache.sqoop.util;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
|
/* Buffer up lines of text until buffer is full.
|
||||||
|
* This helper class is just to support a test to
|
||||||
|
* see if the copyIn mapper for postgres is unbuffered
|
||||||
|
*
|
||||||
|
* */
|
||||||
|
public class LineBuffer{
|
||||||
|
public static final Log LOG = LogFactory.getLog(LineBuffer.class.getName());
|
||||||
|
|
||||||
|
private StringBuilder sb=new StringBuilder();
|
||||||
|
private static int MAXLEN=100000000;
|
||||||
|
//private static int MAXLEN=50000000;
|
||||||
|
|
||||||
|
public void clear(){ sb.setLength(0); }
|
||||||
|
public int length(){return sb.length();}
|
||||||
|
public boolean append(String s){
|
||||||
|
// LOG.debug(s);
|
||||||
|
if (sb.length()+s.length()+1>MAXLEN){return false;}
|
||||||
|
sb.append(s);
|
||||||
|
sb.append("\n");
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
public String toString(){return sb.toString();}
|
||||||
|
public byte[] getBytes() {
|
||||||
|
try {
|
||||||
|
//LOG.debug("returning "+new String(sb.toString().getBytes("UTF-8")));
|
||||||
|
return sb.toString().getBytes("UTF-8");
|
||||||
|
}catch(Exception e){e.printStackTrace();return null;}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user