From 315fff82b906007922d7e4eaf84655c5373990e8 Mon Sep 17 00:00:00 2001
From: Bilung Lee
Date: Fri, 28 Oct 2011 16:32:43 +0000
Subject: [PATCH] SQOOP-379 Migrate lib and io packages to new name space

git-svn-id: https://svn.apache.org/repos/asf/incubator/sqoop/trunk@1190430 13f79535-47bb-0310-9956-ffa450edef68
---
 build.xml                                      |    2 +-
 src/java/com/cloudera/sqoop/io/CodecMap.java   |  119 +-
 .../sqoop/io/FixedLengthInputStream.java       |   70 +-
 src/java/com/cloudera/sqoop/io/LobFile.java    | 1746 +---------------
 .../com/cloudera/sqoop/io/LobReaderCache.java  |   98 +-
 src/java/com/cloudera/sqoop/io/NamedFifo.java  |   72 +-
 .../sqoop/io/SplittableBufferedWriter.java     |   47 +-
 .../sqoop/io/SplittingOutputStream.java        |  132 +-
 .../sqoop/io/UnsupportedCodecException.java    |   11 +-
 .../sqoop/lib/BigDecimalSerializer.java        |   38 +-
 src/java/com/cloudera/sqoop/lib/BlobRef.java   |   82 +-
 .../com/cloudera/sqoop/lib/BooleanParser.java  |    8 +-
 src/java/com/cloudera/sqoop/lib/ClobRef.java   |   66 +-
 .../com/cloudera/sqoop/lib/DelimiterSet.java   |  170 +-
 .../cloudera/sqoop/lib/FieldFormatter.java     |   78 +-
 .../cloudera/sqoop/lib/FieldMapProcessor.java  |   21 +-
 .../com/cloudera/sqoop/lib/FieldMappable.java  |   16 +-
 .../sqoop/lib/JdbcWritableBridge.java          |  165 +-
 .../cloudera/sqoop/lib/LargeObjectLoader.java  |  280 +--
 src/java/com/cloudera/sqoop/lib/LobRef.java    |  293 +--
 .../com/cloudera/sqoop/lib/LobSerializer.java  |   18 +-
 .../sqoop/lib/ProcessingException.java         |   15 +-
 .../com/cloudera/sqoop/lib/RecordParser.java   |  304 +--
 .../com/cloudera/sqoop/lib/SqoopRecord.java    |  128 +-
 src/java/org/apache/sqoop/io/CodecMap.java     |  170 ++
 .../sqoop/io/FixedLengthInputStream.java       |   87 +
 src/java/org/apache/sqoop/io/LobFile.java      | 1821 +++++++++++++++++
 .../org/apache/sqoop/io/LobReaderCache.java    |  134 ++
 src/java/org/apache/sqoop/io/NamedFifo.java    |   94 +
 .../sqoop/io/SplittableBufferedWriter.java     |   72 +
 .../sqoop/io/SplittingOutputStream.java        |  159 ++
 .../sqoop/io/UnsupportedCodecException.java    |   38 +
 .../sqoop/lib/BigDecimalSerializer.java        |   82 +
 src/java/org/apache/sqoop/lib/BlobRef.java     |  130 ++
 .../org/apache/sqoop/lib/BooleanParser.java    |   41 +
 src/java/org/apache/sqoop/lib/ClobRef.java     |  113 +
 .../org/apache/sqoop/lib/DelimiterSet.java     |  205 ++
 .../org/apache/sqoop/lib/FieldFormatter.java   |  139 ++
 .../apache/sqoop/lib/FieldMapProcessor.java    |   39 +
 .../org/apache/sqoop/lib/FieldMappable.java    |   34 +
 .../apache/sqoop/lib/JdbcWritableBridge.java   |  256 +++
 .../apache/sqoop/lib/LargeObjectLoader.java    |  322 +++
 src/java/org/apache/sqoop/lib/LobRef.java      |  329 +++
 .../org/apache/sqoop/lib/LobSerializer.java    |   54 +
 .../apache/sqoop/lib/ProcessingException.java  |   47 +
 .../org/apache/sqoop/lib/RecordParser.java     |  371 ++++
 .../org/apache/sqoop/lib/SqoopRecord.java      |  159 ++
 47 files changed, 5119 insertions(+), 3756 deletions(-)
 create mode 100644 src/java/org/apache/sqoop/io/CodecMap.java
 create mode 100644 src/java/org/apache/sqoop/io/FixedLengthInputStream.java
 create mode 100644 src/java/org/apache/sqoop/io/LobFile.java
 create mode 100644 src/java/org/apache/sqoop/io/LobReaderCache.java
 create mode 100644 src/java/org/apache/sqoop/io/NamedFifo.java
 create mode 100644 src/java/org/apache/sqoop/io/SplittableBufferedWriter.java
 create mode 100644 src/java/org/apache/sqoop/io/SplittingOutputStream.java
 create mode 100644 src/java/org/apache/sqoop/io/UnsupportedCodecException.java
 create mode 100644 src/java/org/apache/sqoop/lib/BigDecimalSerializer.java
 create mode 100644 src/java/org/apache/sqoop/lib/BlobRef.java
 create mode 100644
src/java/org/apache/sqoop/lib/BooleanParser.java create mode 100644 src/java/org/apache/sqoop/lib/ClobRef.java create mode 100644 src/java/org/apache/sqoop/lib/DelimiterSet.java create mode 100644 src/java/org/apache/sqoop/lib/FieldFormatter.java create mode 100644 src/java/org/apache/sqoop/lib/FieldMapProcessor.java create mode 100644 src/java/org/apache/sqoop/lib/FieldMappable.java create mode 100644 src/java/org/apache/sqoop/lib/JdbcWritableBridge.java create mode 100644 src/java/org/apache/sqoop/lib/LargeObjectLoader.java create mode 100644 src/java/org/apache/sqoop/lib/LobRef.java create mode 100644 src/java/org/apache/sqoop/lib/LobSerializer.java create mode 100644 src/java/org/apache/sqoop/lib/ProcessingException.java create mode 100644 src/java/org/apache/sqoop/lib/RecordParser.java create mode 100644 src/java/org/apache/sqoop/lib/SqoopRecord.java diff --git a/build.xml b/build.xml index 85ef84a7..e629d6b5 100644 --- a/build.xml +++ b/build.xml @@ -808,7 +808,6 @@ + diff --git a/src/java/com/cloudera/sqoop/io/CodecMap.java b/src/java/com/cloudera/sqoop/io/CodecMap.java index ae517352..ffe949bc 100644 --- a/src/java/com/cloudera/sqoop/io/CodecMap.java +++ b/src/java/com/cloudera/sqoop/io/CodecMap.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,22 +15,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - - package com.cloudera.sqoop.io; -import java.util.List; -import java.util.Map; import java.util.Set; -import java.util.TreeMap; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.compress.CompressionCodec; -import org.apache.hadoop.io.compress.CompressionCodecFactory; -import org.apache.hadoop.util.ReflectionUtils; /** * Provides a mapping from codec names to concrete implementation class names. + * + * @deprecated use org.apache.sqoop.io.CodecMap instead. + * @see org.apache.sqoop.io.CodecMap */ public final class CodecMap { @@ -40,33 +34,10 @@ public final class CodecMap { // Note: do not add more values here, since codecs are discovered using the // standard Hadoop mechanism (io.compression.codecs). See // CompressionCodecFactory. - public static final String NONE = "none"; - public static final String DEFLATE = "deflate"; - public static final String LZO = "lzo"; - public static final String LZOP = "lzop"; - - private static Map codecNames; - static { - codecNames = new TreeMap(); - - // Register the names of codecs we know about. 
- codecNames.put(NONE, null); - codecNames.put(DEFLATE, "org.apache.hadoop.io.compress.DefaultCodec"); - codecNames.put(LZO, "com.hadoop.compression.lzo.LzoCodec"); - codecNames.put(LZOP, "com.hadoop.compression.lzo.LzopCodec"); - - // add more from Hadoop CompressionCodecFactory - for (Class cls - : CompressionCodecFactory.getCodecClasses(new Configuration())) { - String simpleName = cls.getSimpleName(); - String codecName = simpleName; - if (simpleName.endsWith("Codec")) { - codecName = simpleName.substring(0, simpleName.length() - - "Codec".length()); - } - codecNames.put(codecName.toLowerCase(), cls.getCanonicalName()); - } - } + public static final String NONE = org.apache.sqoop.io.CodecMap.NONE; + public static final String DEFLATE = org.apache.sqoop.io.CodecMap.DEFLATE; + public static final String LZO = org.apache.sqoop.io.CodecMap.LZO; + public static final String LZOP = org.apache.sqoop.io.CodecMap.LZOP; private CodecMap() { } @@ -79,11 +50,7 @@ private CodecMap() { */ public static String getCodecClassName(String codecName) throws UnsupportedCodecException { - if (!codecNames.containsKey(codecName)) { - throw new UnsupportedCodecException(codecName); - } - - return codecNames.get(codecName); + return org.apache.sqoop.io.CodecMap.getCodecClassName(codecName); } /** @@ -94,79 +61,13 @@ public static String getCodecClassName(String codecName) */ public static CompressionCodec getCodec(String codecName, Configuration conf) throws UnsupportedCodecException { - // Try standard Hadoop mechanism first - CompressionCodec codec = getCodecByName(codecName, conf); - if (codec != null) { - return codec; - } - // Fall back to Sqoop mechanism - String codecClassName = null; - try { - codecClassName = getCodecClassName(codecName); - if (null == codecClassName) { - return null; - } - Class codecClass = - (Class) - conf.getClassByName(codecClassName); - return (CompressionCodec) ReflectionUtils.newInstance( - codecClass, conf); - } catch (ClassNotFoundException cnfe) { - throw new UnsupportedCodecException("Cannot find codec class " - + codecClassName + " for codec " + codecName); - } - } - - /** - * Find the relevant compression codec for the codec's canonical class name - * or by codec alias. - *
- * Codec aliases are case insensitive.
- *
- * The codec alias is the short class name (without the package name).
- * If the short class name ends with 'Codec', then there are two aliases for
- * the codec, the complete short class name and the short class name without
- * the 'Codec' ending. For example, for the 'GzipCodec' codec class name the
- * aliases are 'gzip' and 'gzipcodec'.
- *
- * Note: When HADOOP-7323 is available this method can be replaced with a call - * to CompressionCodecFactory. - * @param classname the canonical class name of the codec or the codec alias - * @return the codec object or null if none matching the name were found - */ - private static CompressionCodec getCodecByName(String codecName, - Configuration conf) { - List> codecs = - CompressionCodecFactory.getCodecClasses(conf); - for (Class cls : codecs) { - if (codecMatches(cls, codecName)) { - return ReflectionUtils.newInstance(cls, conf); - } - } - return null; - } - - private static boolean codecMatches(Class cls, - String codecName) { - String simpleName = cls.getSimpleName(); - if (cls.getName().equals(codecName) - || simpleName.equalsIgnoreCase(codecName)) { - return true; - } - if (simpleName.endsWith("Codec")) { - String prefix = simpleName.substring(0, simpleName.length() - - "Codec".length()); - if (prefix.equalsIgnoreCase(codecName)) { - return true; - } - } - return false; + return org.apache.sqoop.io.CodecMap.getCodec(codecName, conf); } /** * Return the set of available codec names. */ public static Set getCodecNames() { - return codecNames.keySet(); + return org.apache.sqoop.io.CodecMap.getCodecNames(); } } diff --git a/src/java/com/cloudera/sqoop/io/FixedLengthInputStream.java b/src/java/com/cloudera/sqoop/io/FixedLengthInputStream.java index 15bd9248..806af22c 100644 --- a/src/java/com/cloudera/sqoop/io/FixedLengthInputStream.java +++ b/src/java/com/cloudera/sqoop/io/FixedLengthInputStream.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,76 +15,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - - package com.cloudera.sqoop.io; import java.io.InputStream; -import java.io.IOException; - -import org.apache.commons.io.input.CloseShieldInputStream; -import org.apache.commons.io.input.CountingInputStream; -import org.apache.commons.io.input.ProxyInputStream; /** * Provides an InputStream that can consume a fixed maximum number of bytes * from an underlying stream. Closing the FixedLengthInputStream does not * close the underlying stream. After reading the maximum number of available * bytes this acts as though EOF has been reached. + * + * @deprecated use org.apache.sqoop.io.FixedLengthInputStream instead. + * @see org.apache.sqoop.io.FixedLengthInputStream */ -public class FixedLengthInputStream extends ProxyInputStream { +public class FixedLengthInputStream + extends org.apache.sqoop.io.FixedLengthInputStream { - private CountingInputStream countingIn; - private long maxBytes; - - public FixedLengthInputStream(InputStream stream, long maxLen) { - super(new CountingInputStream(new CloseShieldInputStream(stream))); - - // Save a correctly-typed reference to the underlying stream. - this.countingIn = (CountingInputStream) this.in; - this.maxBytes = maxLen; - } - - /** @return the number of bytes already consumed by the client. */ - private long consumed() { - return countingIn.getByteCount(); - } - - /** - * @return number of bytes remaining to be read before the limit - * is reached. 
- */ - private long toLimit() { - return maxBytes - consumed(); - } - - @Override - public int available() throws IOException { - return (int) Math.min(toLimit(), countingIn.available()); - } - - @Override - public int read() throws IOException { - if (toLimit() > 0) { - return super.read(); - } else { - return -1; // EOF. - } - } - - @Override - public int read(byte [] buf) throws IOException { - return read(buf, 0, buf.length); - } - - @Override - public int read(byte [] buf, int start, int count) throws IOException { - long limit = toLimit(); - if (limit == 0) { - return -1; // EOF. - } else { - return super.read(buf, start, (int) Math.min(count, limit)); - } - } + public FixedLengthInputStream(InputStream stream, long maxLen) { + super(stream, maxLen); + } } diff --git a/src/java/com/cloudera/sqoop/io/LobFile.java b/src/java/com/cloudera/sqoop/io/LobFile.java index 9de8432f..16878691 100644 --- a/src/java/com/cloudera/sqoop/io/LobFile.java +++ b/src/java/com/cloudera/sqoop/io/LobFile.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,56 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - - package com.cloudera.sqoop.io; -import java.io.BufferedOutputStream; -import java.io.Closeable; -import java.io.DataInput; -import java.io.DataInputStream; -import java.io.DataOutput; -import java.io.DataOutputStream; -import java.io.EOFException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.io.IOException; -import java.io.OutputStream; -import java.io.OutputStreamWriter; -import java.io.UnsupportedEncodingException; -import java.util.AbstractMap; -import java.util.Arrays; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; -import org.apache.commons.io.output.CloseShieldOutputStream; -import org.apache.commons.io.output.CountingOutputStream; import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.DataInputBuffer; -import org.apache.hadoop.io.DataOutputBuffer; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.WritableUtils; -import org.apache.hadoop.io.compress.CompressionCodec; -import org.apache.hadoop.io.compress.Compressor; -import org.apache.hadoop.io.compress.CompressorStream; -import org.apache.hadoop.io.compress.Decompressor; -import org.apache.hadoop.io.compress.DecompressorStream; - -import com.cloudera.sqoop.util.RandomHash; +import org.apache.hadoop.fs.Path; /** * File format which stores large object records. @@ -75,1698 +30,55 @@ * Each record is assigned an id and can be accessed by id efficiently by * consulting an index at the end of the file. * - * The LobFile format is specified at: - * http://wiki.github.com/cloudera/sqoop/sip-3 + * @deprecated use org.apache.sqoop.io.LobFile instead. 
+ * @see org.apache.sqoop.io.LobFile */ public final class LobFile { private LobFile() { } - public static final Log LOG = LogFactory.getLog(LobFile.class.getName()); + public static final Log LOG = org.apache.sqoop.io.LobFile.LOG; - public static final int LATEST_LOB_VERSION = 0; - static final char [] HEADER_ID_STR = { 'L', 'O', 'B' }; + public static final int LATEST_LOB_VERSION = + org.apache.sqoop.io.LobFile.LATEST_LOB_VERSION; + + // Must be in sync with org.apache.sqoop.io.LobFile.HEADER_ID_STR + static final char [] HEADER_ID_STR = + org.apache.sqoop.io.LobFile.HEADER_ID_STR; // Value for entryId to write to the beginning of an IndexSegment. - static final long SEGMENT_HEADER_ID = -1; + static final long SEGMENT_HEADER_ID = + org.apache.sqoop.io.LobFile.SEGMENT_HEADER_ID; // Value for entryId to write before the finale. - static final long SEGMENT_OFFSET_ID = -2; + static final long SEGMENT_OFFSET_ID = + org.apache.sqoop.io.LobFile.SEGMENT_OFFSET_ID; // Value for entryID to write before the IndexTable - static final long INDEX_TABLE_ID = -3; + static final long INDEX_TABLE_ID = org.apache.sqoop.io.LobFile.INDEX_TABLE_ID; /** - * Represents a header block in a LobFile. Can write a new header - * block (and generate a record start mark), or read an existing - * header block. + * @deprecated use org.apache.sqoop.io.LobFile.Writer + * @see org.apache.sqoop.io.LobFile.Writer */ - private static class LobFileHeader implements Writable { - - private int version; - private RecordStartMark startMark; - private MetaBlock metaBlock; - - /** - * Create a new LobFileHeader. - */ - public LobFileHeader() { - this.version = LATEST_LOB_VERSION; - this.startMark = new RecordStartMark(); - this.metaBlock = new MetaBlock(); - } - - /** - * Read a LobFileHeader from an existing file. - */ - public LobFileHeader(DataInput in) throws IOException { - readFields(in); - } - - /** - * Write a LobFile header to an output sink. - */ - public void write(DataOutput out) throws IOException { - // Start with the file type identification. - for (char c : HEADER_ID_STR) { - out.writeByte((int) c); - } - - // Write the format version - WritableUtils.writeVInt(out, this.version); - - startMark.write(out); - metaBlock.write(out); - } - - public void readFields(DataInput in) throws IOException { - char [] chars = new char[3]; - for (int i = 0; i < 3; i++) { - chars[i] = (char) in.readByte(); - } - - // Check that these match what we expect. Throws IOE if not. - checkHeaderChars(chars); - - this.version = WritableUtils.readVInt(in); - if (this.version != LATEST_LOB_VERSION) { - // Right now we only have one version we can handle. - throw new IOException("Unexpected LobFile version " + this.version); - } - - this.startMark = new RecordStartMark(in); - this.metaBlock = new MetaBlock(in); - } - - /** - * Checks that a header array matches the standard LobFile header. - * Additional data at the end of the headerStamp is ignored. - * @param headerStamp the header bytes received from the file. - * @throws IOException if it doesn't. 
- */ - private void checkHeaderChars(char [] headerStamp) throws IOException { - if (headerStamp.length != HEADER_ID_STR.length) { - throw new IOException("Invalid LobFile header stamp: expected length " - + HEADER_ID_STR.length); - } - for (int i = 0; i < HEADER_ID_STR.length; i++) { - if (headerStamp[i] != HEADER_ID_STR[i]) { - throw new IOException("Invalid LobFile header stamp"); - } - } - } - - /** - * @return the format version number for this LobFile - */ - public int getVersion() { - return version; - } - - /** - * @return the RecordStartMark for this LobFile. - */ - public RecordStartMark getStartMark() { - return startMark; - } - - /** - * @return the MetaBlock for this LobFile. - */ - public MetaBlock getMetaBlock() { - return metaBlock; - } + public abstract static class Writer + extends org.apache.sqoop.io.LobFile.Writer { } /** - * Holds a RecordStartMark -- a 16 byte randomly-generated - * sync token. Can read a RSM from an input source, or can - * generate a new one. + * @deprecated use org.apache.sqoop.io.LobFile.Reader instead. + * @see org.apache.sqoop.io.LobFile.Reader */ - private static class RecordStartMark implements Writable { - - // This is a 16-byte array. - public static final int START_MARK_LENGTH = 16; - - private byte [] startBytes; - - public RecordStartMark() { - generateStartMark(); - } - - public RecordStartMark(DataInput in) throws IOException { - readFields(in); - } - - public byte [] getBytes() { - byte [] out = new byte[START_MARK_LENGTH]; - System.arraycopy(this.startBytes, 0, out, 0, START_MARK_LENGTH); - return out; - } - - public void readFields(DataInput in) throws IOException { - this.startBytes = new byte[START_MARK_LENGTH]; - in.readFully(this.startBytes); - } - - public void write(DataOutput out) throws IOException { - out.write(this.startBytes); - } - - /** - * Generate a new random RecordStartMark. - */ - private void generateStartMark() { - this.startBytes = RandomHash.generateMD5Bytes(); - } - } - - /** - * Represents the metadata block stored in the header of a LobFile. - */ - private static class MetaBlock extends AbstractMap - implements Writable { - - // Strings which typically appear in the metablock have canonical names. - public static final String ENTRY_ENCODING_KEY = "EntryEncoding"; - public static final String COMPRESSION_CODEC_KEY = "CompressionCodec"; - public static final String ENTRIES_PER_SEGMENT_KEY = "EntriesPerSegment"; - - // Standard entry encodings. - public static final String CLOB_ENCODING = "CLOB"; - public static final String BLOB_ENCODING = "BLOB"; - - private Map entries; - - public MetaBlock() { - entries = new TreeMap(); - } - - public MetaBlock(DataInput in) throws IOException { - entries = new TreeMap(); - readFields(in); - } - - public MetaBlock(Map map) { - entries = new TreeMap(); - for (Map.Entry entry : map.entrySet()) { - entries.put(entry.getKey(), entry.getValue()); - } - } - - @Override - public Set> entrySet() { - return entries.entrySet(); - } - - @Override - public BytesWritable put(String k, BytesWritable v) { - BytesWritable old = entries.get(k); - entries.put(k, v); - return old; - } - - public BytesWritable put(String k, String v) { - try { - return put(k, new BytesWritable(v.getBytes("UTF-8"))); - } catch (UnsupportedEncodingException uee) { - // Shouldn't happen; UTF-8 is always supported. 
- throw new RuntimeException(uee); - } - } - - @Override - public BytesWritable get(Object k) { - return entries.get(k); - } - - public String getString(Object k) { - BytesWritable bytes = get(k); - if (null == bytes) { - return null; - } else { - try { - return new String(bytes.getBytes(), 0, bytes.getLength(), "UTF-8"); - } catch (UnsupportedEncodingException uee) { - // Shouldn't happen; UTF-8 is always supported. - throw new RuntimeException(uee); - } - } - } - - public void readFields(DataInput in) throws IOException { - int numEntries = WritableUtils.readVInt(in); - entries.clear(); - for (int i = 0; i < numEntries; i++) { - String key = Text.readString(in); - BytesWritable val = new BytesWritable(); - val.readFields(in); - entries.put(key, val); - } - } - - public void write(DataOutput out) throws IOException { - int numEntries = entries.size(); - WritableUtils.writeVInt(out, numEntries); - for (Map.Entry entry : entries.entrySet()) { - Text.writeString(out, entry.getKey()); - entry.getValue().write(out); - } - } - } - - /** - * Class that represents the IndexSegment entries in a LobIndex. - */ - private static class IndexSegment implements Writable { - - // The main body of the IndexSegment: the record lengths - // of all the records in the IndexSegment. - private BytesWritable recordLenBytes; - - // The length of the previously recorded field (used when - // generating an index). Intermediate state used in calculation - // of the lastIndexOffset. - private long prevLength; - - // Used to write VLong-encoded lengths into a temp - // array, which are then copied into recordLenBytes. - private DataOutputBuffer outputBuffer; - - // The IndexTableEntry that describes this IndexSegment in the IndexTable. - private IndexTableEntry tableEntry; - - public IndexSegment(IndexTableEntry tableEntry) { - this.recordLenBytes = new BytesWritable(); - this.outputBuffer = new DataOutputBuffer(10); // max VLong size. - this.tableEntry = tableEntry; - } - - /** - * Read an IndexSegment from an existing file. - */ - public IndexSegment(IndexTableEntry tableEntry, DataInput in) - throws IOException { - this.recordLenBytes = new BytesWritable(); - this.outputBuffer = new DataOutputBuffer(10); - this.tableEntry = tableEntry; - readFields(in); - } - - /** - * @return the IndexTableEntry describing this IndexSegment in the - * IndexTable. - */ - public IndexTableEntry getTableEntry() { - return tableEntry; - } - - /** - * Add a recordLength to the recordLenBytes array. - */ - public void addRecordLen(long recordLen) throws IOException { - // Allocate space for the new bytes. - int numBytes = WritableUtils.getVIntSize(recordLen); - recordLenBytes.setSize(recordLenBytes.getLength() + numBytes); - - // Write the new bytes into a temporary buffer wrapped in a DataOutput. - outputBuffer.reset(); - WritableUtils.writeVLong(outputBuffer, recordLen); - - // Then copy those new bytes into the end of the recordLenBytes array. - System.arraycopy(outputBuffer.getData(), 0, recordLenBytes.getBytes(), - recordLenBytes.getLength() - numBytes, numBytes); - - // Now that we've added a new recordLength to the array, - // it's the last index. We need to calculate its offset. - // This is based on how long the previous record was. - this.tableEntry.setLastIndexOffset( - this.tableEntry.getLastIndexOffset() + this.prevLength); - - // Save this record's length (unserialized) for calculating - // lastIndexOffset for the next record. 
- this.prevLength = recordLen; - } - - public void write(DataOutput out) throws IOException { - // Write the SEGMENT_HEADER_ID to distinguish this from a LobRecord. - WritableUtils.writeVLong(out, SEGMENT_HEADER_ID); - - // The length of the main body of the segment is the length of the - // data byte array. - int segmentBytesLen = recordLenBytes.getLength(); - WritableUtils.writeVLong(out, segmentBytesLen); - - // Write the body of the segment. - out.write(recordLenBytes.getBytes(), 0, segmentBytesLen); - } - - public void readFields(DataInput in) throws IOException { - // After the RecordStartMark, we expect to get a SEGMENT_HEADER_ID (-1). - long segmentId = WritableUtils.readVLong(in); - if (SEGMENT_HEADER_ID != segmentId) { - throw new IOException("Expected segment header id " + SEGMENT_HEADER_ID - + "; got " + segmentId); - } - - // Get the length of the rest of the segment, in bytes. - long length = WritableUtils.readVLong(in); - - // Now read the actual main byte array. - if (length > Integer.MAX_VALUE) { - throw new IOException("Unexpected oversize data array length: " - + length); - } else if (length < 0) { - throw new IOException("Unexpected undersize data array length: " - + length); - } - byte [] segmentData = new byte[(int) length]; - in.readFully(segmentData); - recordLenBytes = new BytesWritable(segmentData); - - reset(); // Reset the iterator allowing the user to yield offset/lengths. - } - - - // The following methods are used by a Reader to walk through the index - // segment and get data about the records described in this segment of - // the index. - - private DataInputBuffer dataInputBuf; - - // The following two fields are advanced by the next() method. - private long curOffset; // offset into the file of the current record. - private long curLen; // length of the current record in bytes. - - // Used to allow rewindOnce() to go backwards a single position in the - // iterator. - private int prevInputBufPos; // prev offset into dataInputBuf. - private long prevOffset; - private long prevLen; - - /** - * Resets the record index iterator. - */ - public void reset() { - this.dataInputBuf = null; - } - - /** - * Aligns the iteration capability to return info about the next - * record in the IndexSegment. Must be called before the first - * record. - * @return true if there is another record described in this IndexSegment. - */ - public boolean next() { - this.prevOffset = this.curOffset; - if (null == dataInputBuf) { - // We need to set up the iterator; this is the first use. - if (null == recordLenBytes) { - return false; // We don't have any records? - } - - this.dataInputBuf = new DataInputBuffer(); - this.dataInputBuf.reset(recordLenBytes.getBytes(), - 0, recordLenBytes.getLength()); - - this.curOffset = this.tableEntry.getFirstIndexOffset(); - this.prevOffset = 0; - } else { - this.curOffset += this.curLen; - } - - boolean available = dataInputBuf.getPosition() < dataInputBuf.getLength(); - if (available) { - this.prevInputBufPos = dataInputBuf.getPosition(); - // Then read out the next record length. - try { - this.prevLen = this.curLen; - this.curLen = WritableUtils.readVLong(dataInputBuf); - } catch (IOException ioe) { - // Shouldn't happen; data in DataInputBuffer is materialized. - throw new RuntimeException(ioe); - } - } - - return available; - } - - /** - * Undoes a single call to next(). This cannot be called twice in a row; - * before calling this again, next() must be called in the interim. 
This - * makes a subsequent call to next() yield the same iterated values as the - * previous call. - */ - public void rewindOnce() { - // Move the buffer backwards so we deserialize the same VLong with - // the next call. - if (prevInputBufPos == 0) { - // We actually rewound the first next() in the iterator. - // Just reset the iterator to the beginning. Otherwise we'll - // backfill it with bogus data. - reset(); - } else { - // Use the normal codepath; move the serialization buffer - // backwards and restores the previously yielded values. - dataInputBuf.reset(recordLenBytes.getBytes(), prevInputBufPos, - recordLenBytes.getLength() - prevInputBufPos); - - // And restore the previously-yielded values. - this.curLen = this.prevLen; - this.curOffset = this.prevOffset; - } - } - - /** - * Returns the length of the current record. - * You must call next() and it must return true before calling this method. - * @return the length in bytes of the current record. - */ - public long getCurRecordLen() { - return curLen; - } - - /** - * Returns the offset of the current record from the beginning of the file. - * You must call next() and it must return true before calling this method. - * @return the offset in bytes from the beginning of the file for the - * current record. - */ - public long getCurRecordStart() { - return curOffset; - } - } - - /** - * Describes an IndexSegment. This is one entry in the IndexTable. It - * holds the physical location of the IndexSegment in the file, as well - * as the range of entryIds and byte ranges corresponding to records - * described by the index subset in the IndexSegment. - */ - private static class IndexTableEntry implements Writable { - private long segmentOffset; - private long firstIndexId; - private long firstIndexOffset; - private long lastIndexOffset; - - public IndexTableEntry() { - } - - public IndexTableEntry(DataInput in) throws IOException { - readFields(in); - } - - private void setSegmentOffset(long offset) { - this.segmentOffset = offset; - } - - private void setFirstIndexId(long id) { - this.firstIndexId = id; - } - - private void setFirstIndexOffset(long offset) { - this.firstIndexOffset = offset; - } - - private void setLastIndexOffset(long offset) { - this.lastIndexOffset = offset; - } - - public void write(DataOutput out) throws IOException { - WritableUtils.writeVLong(out, segmentOffset); - WritableUtils.writeVLong(out, firstIndexId); - WritableUtils.writeVLong(out, firstIndexOffset); - WritableUtils.writeVLong(out, lastIndexOffset); - } - - public void readFields(DataInput in) throws IOException { - segmentOffset = WritableUtils.readVLong(in); - firstIndexId = WritableUtils.readVLong(in); - firstIndexOffset = WritableUtils.readVLong(in); - lastIndexOffset = WritableUtils.readVLong(in); - } - - /** - * @return the entryId of the first record indexed by this segment. - */ - public long getFirstIndexId() { - return this.firstIndexId; - } - - /** - * @return the offset of the first record indexed by this segment. - */ - public long getFirstIndexOffset() { - return this.firstIndexOffset; - } - - /** - * @return the offset of the last record indexed by this segment. - */ - public long getLastIndexOffset() { - return this.lastIndexOffset; - } - - /** - * @return the offset from the start of the file of the IndexSegment - * data itself. - */ - public long getSegmentOffset() { - return this.segmentOffset; - } - - /** - * Inform whether the user's requested offset corresponds - * to a record that starts in this IndexSegment. 
If this - * returns true, the requested offset may actually be in - * a previous IndexSegment. - * @param off the offset of the start of a record to test. - * @return true if the user's requested offset is in this - * or a previous IndexSegment. - */ - public boolean containsOffset(long off) { - return off <= getLastIndexOffset(); - } - } - - /** - * Stores the locations and ranges indexed by each IndexSegment. - */ - private static class IndexTable - implements Iterable, Writable { - private List tableEntries; - - public IndexTable() { - tableEntries = new ArrayList(); - } - - public IndexTable(DataInput in) throws IOException { - readFields(in); - } - - public void readFields(DataInput in) throws IOException { - long recordTypeId = WritableUtils.readVLong(in); - if (recordTypeId != INDEX_TABLE_ID) { - // We expected to read an IndexTable. - throw new IOException("Expected IndexTable; got record with typeId=" - + recordTypeId); - } - - int tableCount = WritableUtils.readVInt(in); - - tableEntries = new ArrayList(tableCount); - for (int i = 0; i < tableCount; i++) { - tableEntries.add(new IndexTableEntry(in)); - } - } - - public void write(DataOutput out) throws IOException { - // Start with the record type id. - WritableUtils.writeVLong(out, INDEX_TABLE_ID); - - // Then the count of the records. - WritableUtils.writeVInt(out, tableEntries.size()); - - // Followed by the table itself. - for (IndexTableEntry entry : tableEntries) { - entry.write(out); - } - } - - public void add(IndexTableEntry entry) { - tableEntries.add(entry); - } - - public IndexTableEntry get(int i) { - return tableEntries.get(i); - } - - public int size() { - return tableEntries.size(); - } - - public Iterator iterator() { - return tableEntries.iterator(); - } - } - - /** - * Class that writes out a LobFile. Instantiate via LobFile.create(). - */ - public abstract static class Writer implements Closeable { - - /** - * If this Writer is writing to a physical LobFile, then this returns - * the file path it is writing to. Otherwise it returns null. - * @return the fully-qualified path being written to by this writer. - */ - public abstract Path getPath(); - - /** - * Finishes writing the LobFile and closes underlying handles. - */ - public abstract void close() throws IOException; - - @Override - protected synchronized void finalize() throws Throwable { - close(); - super.finalize(); - } - - /** - * Terminates the current record and writes any trailing zero-padding - * required by the specified record size. - * This is implicitly called between consecutive writeBlobRecord() / - * writeClobRecord() calls. - */ - public abstract void finishRecord() throws IOException; - - /** - * Declares a new BLOB record to be written to the file. - * @param len the "claimed" number of bytes that will be written to - * this record. The actual number of bytes may differ. - */ - public abstract OutputStream writeBlobRecord(long len) throws IOException; - - /** - * Declares a new CLOB record to be written to the file. - * @param len the claimed number of characters that will be written to - * this record. The actual number of characters may differ. - */ - public abstract java.io.Writer writeClobRecord(long len) - throws IOException; - - /** - * Report the current position in the output file. - * @return the number of bytes written through this Writer. - */ - public abstract long tell() throws IOException; - - /** - * Checks whether an underlying stream is present or null. - * @param out the stream to check for null-ness. 
- * @throws IOException if out is null. - */ - protected void checkForNull(OutputStream out) throws IOException { - if (null == out) { - throw new IOException("Writer has been closed."); - } - } - } - - /** - * Concrete writer implementation for LobFile format version 0. - * Instantiate via LobFile.create(). - */ - private static class V0Writer extends Writer { - public static final Log LOG = LogFactory.getLog( - V0Writer.class.getName()); - - private Configuration conf; - private Path path; - private boolean isCharData; - private LobFileHeader header; - - private String codecName; - private CompressionCodec codec; - private Compressor compressor; - - // The LobIndex we are constructing. - private LinkedList indexSegments; - // Number of entries in the current IndexSegment. - private int entriesInSegment; - private IndexTable indexTable; - - // Number of entries that can be written to a single IndexSegment. - private int maxEntriesPerSegment; - - // By default we write this many entries per IndexSegment. - static final int DEFAULT_MAX_SEGMENT_ENTRIES = 4096; - - // Our OutputStream to the underlying file. - private DataOutputStream out; - - // 'out' is layered on top of this stream, which gives us a count - // of how much data we've written so far. - private CountingOutputStream countingOut; - - // State regarding the current record being written. - private long curEntryId; // entryId of the current LOB being written. - private long curClaimedLen; // The user claims a length for a record. - - // The user's OutputStream and/or Writer that writes to us. - private OutputStream userOutputStream; - private java.io.Writer userWriter; - - // The userCountingOutputStream may be the same as userOutputStream; - // but if the user is writing through a compressor, it is actually - // underneath of it. This tells us how many compressed bytes were - // really written. - private CountingOutputStream userCountingOutputStream; - - /** - * Creates a LobFile Writer for file format version 0. - * @param p the path to create. - * @param conf the configuration to use to interact with the filesystem. - * @param isCharData true if this is for CLOBs, false for BLOBs. - * @param codecName the compression codec to use (or null for none). - * @param entriesPerSegment the number of index entries per IndexSegment. - */ - V0Writer(Path p, Configuration conf, boolean isCharData, - String codecName, int entriesPerSegment) throws IOException { - - this.path = LobReaderCache.qualify(p, conf); - this.conf = conf; - this.isCharData = isCharData; - this.header = new LobFileHeader(); - this.indexSegments = new LinkedList(); - this.indexTable = new IndexTable(); - this.maxEntriesPerSegment = entriesPerSegment; - - this.codecName = codecName; - if (this.codecName != null) { - this.codec = CodecMap.getCodec(codecName, conf); - if (null != this.codec) { - this.compressor = codec.createCompressor(); - } - } - - init(); - } - - /** - * Open the file and write its header. - */ - private void init() throws IOException { - FileSystem fs = this.path.getFileSystem(conf); - FSDataOutputStream fsOut = fs.create(this.path); - this.countingOut = new CountingOutputStream( - new BufferedOutputStream(fsOut)); - this.out = new DataOutputStream(this.countingOut); - - // put any necessary config strings into the header. 
- MetaBlock m = this.header.getMetaBlock(); - if (isCharData) { - m.put(MetaBlock.ENTRY_ENCODING_KEY, MetaBlock.CLOB_ENCODING); - } else { - m.put(MetaBlock.ENTRY_ENCODING_KEY, MetaBlock.BLOB_ENCODING); - } - - if (null != codec) { - m.put(MetaBlock.COMPRESSION_CODEC_KEY, this.codecName); - } - - // Serialize the value of maxEntriesPerSegment as a VInt in a byte array - // and put that into the metablock as ENTRIES_PER_SEGMENT_KEY. - int segmentBufLen = WritableUtils.getVIntSize(this.maxEntriesPerSegment); - DataOutputBuffer entriesPerSegBuf = new DataOutputBuffer(segmentBufLen); - WritableUtils.writeVInt(entriesPerSegBuf, this.maxEntriesPerSegment); - byte [] entriesPerSegArray = - Arrays.copyOf(entriesPerSegBuf.getData(), segmentBufLen); - m.put(MetaBlock.ENTRIES_PER_SEGMENT_KEY, - new BytesWritable(entriesPerSegArray)); - - // Write the file header to the file. - this.header.write(out); - - // Now we're ready to accept record data from the user. - } - - @Override - /** {@inheritDoc} */ - public Path getPath() { - return this.path; - } - - @Override - /** - * {@inheritDoc} - */ - public long tell() throws IOException { - checkForNull(this.out); - this.out.flush(); - return this.countingOut.getByteCount(); - } - - @Override - /** - * {@inheritDoc} - */ - public void close() throws IOException { - finishRecord(); - writeIndex(); - if (this.out != null) { - this.out.close(); - this.out = null; - } - - if (this.countingOut != null) { - this.countingOut.close(); - this.countingOut = null; - } - } - - @Override - /** - * {@inheritDoc} - */ - public void finishRecord() throws IOException { - if (null != this.userWriter) { - this.userWriter.close(); - this.userWriter = null; - } - - if (null != this.userCountingOutputStream) { - - // If there is a wrapping stream for compression, - // close this first. - if (null != this.userOutputStream - && this.userOutputStream != this.userCountingOutputStream) { - this.userOutputStream.close(); - } - - // Now close the "main" stream. - this.userCountingOutputStream.close(); - - // Write the true length of the current record to the index. - updateIndex(this.userCountingOutputStream.getByteCount() - + RecordStartMark.START_MARK_LENGTH - + WritableUtils.getVIntSize(curEntryId) - + WritableUtils.getVIntSize(curClaimedLen)); - - this.userOutputStream = null; - this.userCountingOutputStream = null; - } - - if (null != this.out) { - out.flush(); - } - } - - /** - * Write in the current IndexSegment, the true compressed length of the - * record we just finished writing. - * @param curRecordLen the true length in bytes of the compressed record. - */ - private void updateIndex(long curRecordLen) throws IOException { - LOG.debug("Adding index entry: id=" + curEntryId - + "; len=" + curRecordLen); - indexSegments.getLast().addRecordLen(curRecordLen); - entriesInSegment++; - curEntryId++; - } - - /** - * Write the index itself to the file. - */ - private void writeIndex() throws IOException { - - // Write out all the segments in turn. - // As we do so, reify their offsets into the IndexTable. - for (IndexSegment segment : indexSegments) { - long segmentOffset = tell(); - segment.getTableEntry().setSegmentOffset(segmentOffset); - - header.getStartMark().write(out); - segment.write(out); - } - - long indexTableStartPos = tell(); // Save for the end of the file. - LOG.debug("IndexTable offset: " + indexTableStartPos); - - header.getStartMark().write(out); - indexTable.write(out); // write the IndexTable record. 
- - // Write the finale that tells us where the IndexTable begins. - header.getStartMark().write(out); - WritableUtils.writeVLong(out, SEGMENT_OFFSET_ID); - WritableUtils.writeVLong(out, indexTableStartPos); - } - - /** - * Prepare to index a new record that will soon be written to the file. - * If this is is the first record in the current IndexSegment, we need - * to record its entryId and the current file position. - */ - private void startRecordIndex() throws IOException { - if (entriesInSegment == maxEntriesPerSegment - || indexSegments.size() == 0) { - // The current segment is full. Start a new one. - this.entriesInSegment = 0; - IndexTableEntry tableEntry = new IndexTableEntry(); - IndexSegment curSegment = new IndexSegment(tableEntry); - this.indexSegments.add(curSegment); - - long filePos = tell(); - LOG.debug("Starting IndexSegment; first id=" + curEntryId - + "; off=" + filePos); - tableEntry.setFirstIndexId(curEntryId); - tableEntry.setFirstIndexOffset(filePos); - tableEntry.setLastIndexOffset(filePos); - this.indexTable.add(tableEntry); - } - } - - @Override - /** - * {@inheritDoc} - */ - public OutputStream writeBlobRecord(long claimedLen) throws IOException { - finishRecord(); // finish any previous record. - checkForNull(this.out); - startRecordIndex(); - this.header.getStartMark().write(out); - LOG.debug("Starting new record; id=" + curEntryId - + "; claimedLen=" + claimedLen); - WritableUtils.writeVLong(out, curEntryId); - WritableUtils.writeVLong(out, claimedLen); - this.curClaimedLen = claimedLen; - this.userCountingOutputStream = new CountingOutputStream( - new CloseShieldOutputStream(out)); - if (null == this.codec) { - // No codec; pass thru the same OutputStream to the user. - this.userOutputStream = this.userCountingOutputStream; - } else { - // Wrap our CountingOutputStream in a compressing OutputStream to - // give to the user. - this.compressor.reset(); - this.userOutputStream = new CompressorStream( - this.userCountingOutputStream, compressor); - } - - return this.userOutputStream; - } - - @Override - /** - * {@inheritDoc} - */ - public java.io.Writer writeClobRecord(long len) throws IOException { - if (!isCharData) { - throw new IOException( - "Can only write CLOB data to a Clob-specific LobFile"); - } - - // Get a binary handle to the record and wrap it in a java.io.Writer. - writeBlobRecord(len); - this.userWriter = new OutputStreamWriter(userOutputStream); - return this.userWriter; - } - } - - /** - * Class that can read a LobFile. Create with LobFile.open(). - */ - public abstract static class Reader implements Closeable { - /** - * If this Reader is reading from a physical LobFile, then this returns - * the file path it is reading from. Otherwise it returns null. - * @return the fully-qualified path being read by this reader. - */ - public abstract Path getPath(); - - /** - * Report the current position in the file. Note that the internal - * cursor may move in an unpredictable fashion; e.g., to fetch - * additional data from the index stored at the end of the file. - * Clients may be more interested in the getRecordOffset() method - * which returns the starting offset of the current record. - * @return the current offset from the start of the file in bytes. - */ - public abstract long tell() throws IOException; - - /** - * Move the file pointer to the first available full record beginning at - * position 'pos', relative to the start of the file. After calling - * seek(), you will need to call next() to move to the record itself. 
- * @param pos the position to seek to or past. - */ - public abstract void seek(long pos) throws IOException; - - /** - * Advances to the next record in the file. - * @return true if another record exists, or false if the - * end of the file has been reached. - */ - public abstract boolean next() throws IOException; - - /** - * @return true if we have aligned the Reader (through a call to next()) - * onto a record. - */ - public abstract boolean isRecordAvailable(); - - /** - * Reports the length of the record to the user. - * If next() has not been called, or seek() has been called without - * a subsequent call to next(), or next() returned false, the return - * value of this method is undefined. - * @return the 'claimedLen' field of the current record. For - * character-based records, this is often in characters, not bytes. - * Records may have more bytes associated with them than are reported - * by this method, but never fewer. - */ - public abstract long getRecordLen(); - - /** - * Return the entryId of the current record to the user. - * If next() has not been called, or seek() has been called without - * a subsequent call to next(), or next() returned false, the return - * value of this method is undefined. - * @return the 'entryId' field of the current record. - */ - public abstract long getRecordId(); - - /** - * Return the byte offset at which the current record starts. - * If next() has not been called, or seek() has been called without - * a subsequent call to next(), or next() returned false, the return - * value of this method is undefined. - * @return the byte offset of the beginning of the current record. - */ - public abstract long getRecordOffset(); - - /** - * @return an InputStream allowing the user to read the next binary - * record from the file. - */ - public abstract InputStream readBlobRecord() throws IOException; - - /** - * @return a java.io.Reader allowing the user to read the next character - * record from the file. - */ - public abstract java.io.Reader readClobRecord() throws IOException; - - /** - * Closes the reader. - */ - public abstract void close() throws IOException; - - /** - * Checks whether an underlying stream is present or null. - * @param in the stream to check for null-ness. - * @throws IOException if in is null. - */ - protected void checkForNull(InputStream in) throws IOException { - if (null == in) { - throw new IOException("Reader has been closed."); - } - } - - /** - * @return true if the Reader.close() method has been called. - */ - public abstract boolean isClosed(); - - @Override - protected synchronized void finalize() throws Throwable { - close(); - super.finalize(); - } - } - - /** - * Reader implementation for LobFile format version 0. Acquire with - * LobFile.open(). - */ - private static class V0Reader extends Reader { - public static final Log LOG = LogFactory.getLog( - V0Reader.class.getName()); - - // Forward seeks of up to this size are performed by reading, not seeking. - private static final long MAX_CONSUMPTION_WIDTH = 512 * 1024; - - private LobFileHeader header; - - private Configuration conf; - - // Codec to use to decompress the file. - private CompressionCodec codec; - private Decompressor decompressor; - - // Length of the entire file. - private long fileLen; - - // State bit set to true after we've called next() and successfully - // aligned on a record. If true, we can hand an InputStream back to - // the user. 
- private boolean isAligned; - - // After we've aligned on a record, this contains the record's - // reported length. In the presence of compression, etc, this may - // not represent its true length in the file. - private long claimedRecordLen; - - // After we've aligned on a record, this contains its entryId. - private long curEntryId; - - // After we've aligned on a record, this contains the offset of the - // beginning of its RSM from the start of the file. - private long curRecordOffset; - - // After we've aligned on a record, this contains the record's - // true length from the index. - private long indexRecordLen; - - // tmp buffer used to consume RecordStartMarks during alignment. - private byte [] tmpRsmBuf; - - // The actual file stream itself, which we can move around (e.g. with - // seeking). - private FSDataInputStream underlyingInput; - - // The data deserializer we typically place on top of this. - // If we use underlyingInput.seek(), then we instantiate a new - // dataIn on top of it. - private DataInputStream dataIn; - - // The user accesses the current record through a stream memoized here. - // We retain a pointer here so that we can forcibly close the old - // userInputStream when they want to align on the next record. - private InputStream userInputStream; - - // The current index segment to read record lengths from. - private IndexSegment curIndexSegment; - - // The offset into the indexTable of the curIndexSegment. - private int curIndexSegmentId; - - // The IndexTable that provides fast pointers to the IndexSegments. - private IndexTable indexTable; - - // The path being opened. - private Path path; - - // Users should use LobFile.open() instead of directly calling this. - V0Reader(Path path, Configuration conf, LobFileHeader header, - DataInputStream dis, FSDataInputStream stream, long fileLen) - throws IOException { - this.path = LobReaderCache.qualify(path, conf); - this.conf = conf; - this.header = header; - this.dataIn = dis; - this.underlyingInput = stream; - this.isAligned = false; - this.tmpRsmBuf = new byte[RecordStartMark.START_MARK_LENGTH]; - this.fileLen = fileLen; - LOG.debug("Opening LobFile path: " + path); - openCodec(); - openIndex(); - } - - /** - * If the user has specified a compression codec in the header metadata, - * create an instance of it. - */ - private void openCodec() throws IOException { - String codecName = header.getMetaBlock().getString( - MetaBlock.COMPRESSION_CODEC_KEY); - if (null != codecName) { - LOG.debug("Decompressing file with codec: " + codecName); - this.codec = CodecMap.getCodec(codecName, conf); - if (null != this.codec) { - this.decompressor = codec.createDecompressor(); - } - } - } - - /** - * Get the first index segment out of the file; determine - * where that is by loading the index locator at the end of - * the file. - */ - private void openIndex() throws IOException { - // Jump to the end of the file. - // At the end of the file is a RSM followed by two VLongs; - // the first of these is the value -2 (one byte) and the - // second of these is the offset of the beginning of the index (up to - // 9 bytes). - internalSeek(fileLen - RecordStartMark.START_MARK_LENGTH - 10); - - byte [] finaleBuffer = new byte[RecordStartMark.START_MARK_LENGTH + 10]; - this.dataIn.readFully(finaleBuffer); - - // Figure out where in the finaleBuffer the RSM actually starts, - // as the finale might not fully fill the finaleBuffer. 
- int rsmStart = findRecordStartMark(finaleBuffer); - if (-1 == rsmStart) { - throw new IOException( - "Corrupt file index; could not find index start offset."); - } - - // Wrap a buffer around those two vlongs. - int vlongStart = rsmStart + RecordStartMark.START_MARK_LENGTH; - DataInputBuffer inBuf = new DataInputBuffer(); - inBuf.reset(finaleBuffer, vlongStart, finaleBuffer.length - vlongStart); - - long offsetMarker = WritableUtils.readVLong(inBuf); - if (SEGMENT_OFFSET_ID != offsetMarker) { - // This isn't the correct signature; we got an RSM ahead of some - // other data. - throw new IOException("Invalid segment offset id: " + offsetMarker); - } - - // This will contain the position of the IndexTable. - long indexTableStart = WritableUtils.readVLong(inBuf); - LOG.debug("IndexTable begins at " + indexTableStart); - - readIndexTable(indexTableStart); - - // Set up to read records from the beginning of the file. This - // starts with the first IndexSegment. - curIndexSegmentId = 0; - loadIndexSegment(); - - // This has moved the file pointer all over but we don't need to - // worry about resetting it now. The next() method will seek the - // file pointer to the first record when the user is ready to - // consume it. - } - - /** - * Load the entire IndexTable into memory and decode it. - */ - private void readIndexTable(long indexTableOffset) throws IOException { - internalSeek(indexTableOffset); - - // Read the RecordStartMark ahead of the IndexTable. - this.dataIn.readFully(tmpRsmBuf); - if (!matchesRsm(tmpRsmBuf)) { - throw new IOException("Expected record start mark before IndexTable"); - } - - this.indexTable = new IndexTable(dataIn); - } - - /** - * Ingest the next IndexSegment. - */ - private void readNextIndexSegment() throws IOException { - this.curIndexSegmentId++; - loadIndexSegment(); - } - - /** - * Load curIndexSegment with the segment specified by curIndexSegmentId. - * The file pointer will be moved to the position after this segment. - * If the segment id does not exist, then the curIndexSegment will be - * set to null. - */ - private void loadIndexSegment() throws IOException { - if (indexTable.size() <= curIndexSegmentId || curIndexSegmentId < 0) { - // We've iterated past the last IndexSegment. Set this to null - // and return; the next() method will then return false. - this.curIndexSegment = null; - return; - } - - // Otherwise, seek to the segment and load it. - IndexTableEntry tableEntry = indexTable.get(curIndexSegmentId); - long segmentOffset = tableEntry.getSegmentOffset(); - internalSeek(segmentOffset); - readPositionedIndexSegment(); - } - - /** - * When the underlying stream is aligned on the RecordStartMark - * ahead of an IndexSegment, read in the next IndexSegment. - * After this method the curIndexSegment contains the next - * IndexSegment to read in the file; if the entire index has been - * read in this fastion, curIndexSegment will be null. - */ - private void readPositionedIndexSegment() throws IOException { - if (LOG.isDebugEnabled()) { - LOG.debug("Reading index segment at " + tell()); - } - - // Read the RecordStartMark ahead of the IndexSegment. - this.dataIn.readFully(tmpRsmBuf); - if (!matchesRsm(tmpRsmBuf)) { - throw new IOException("Expected record start mark before IndexSegment"); - } - - // Read the IndexSegment proper. - this.curIndexSegment = new IndexSegment( - this.indexTable.get(curIndexSegmentId), this.dataIn); - } - - /** - * @return true if the bytes in 'buf' starting at 'offset' match - * the RecordStartMark. 
- * @param rsm the RecordStartMark - * @param buf the buffer to check - * @param offset the offset into buf to begin checking. - */ - private boolean matchesRsm(byte [] rsm, byte [] buf, int offset) { - for (int i = 0; i < RecordStartMark.START_MARK_LENGTH; i++) { - if (buf[i + offset] != rsm[i]) { - return false; // Mismatch at position i. - } - } - - return true; // Matched the whole thing. - } - - private boolean matchesRsm(byte [] buf, int offset) { - return matchesRsm(this.header.getStartMark().getBytes(), - buf, offset); - } - - private boolean matchesRsm(byte [] buf) { - return matchesRsm(buf, 0); - } - - /** - * @return the offset in 'buf' where a RecordStartMark begins, or -1 - * if the RecordStartMark is not present in the buffer. - */ - private int findRecordStartMark(byte [] buf) { - byte [] rsm = this.header.getStartMark().getBytes(); - - for (int i = 0; i < buf.length; i++) { - if (matchesRsm(rsm, buf, i)) { - return i; - } - } - - return -1; // couldn't find it. - } - - @Override - /** {@inheritDoc} */ - public Path getPath() { - return this.path; - } - - @Override - /** {@inheritDoc} */ - public long tell() throws IOException { - checkForNull(this.underlyingInput); - return this.underlyingInput.getPos(); - } - - @Override - /** {@inheritDoc} */ - public void seek(long pos) throws IOException { - closeUserStream(); - checkForNull(this.underlyingInput); - this.isAligned = false; - searchForRecord(pos); - } - - /** - * Search the index for the first record starting on or after 'start'. - * @param start the offset in the file where we should start looking - * for a record. - */ - private void searchForRecord(long start) throws IOException { - LOG.debug("Looking for the first record at/after offset " + start); - - // Scan through the IndexTable until we find the IndexSegment - // that contains the offset. - for (int i = 0; i < indexTable.size(); i++) { - IndexTableEntry tableEntry = indexTable.get(i); - if (LOG.isDebugEnabled()) { - LOG.debug("Checking index table entry for range: " - + tableEntry.getFirstIndexOffset() + ", " - + tableEntry.getLastIndexOffset()); - } - - if (tableEntry.containsOffset(start)) { - // Seek to the IndexSegment associated with this tableEntry. - curIndexSegmentId = i; - loadIndexSegment(); - - // Use this index segment. The record index iterator - // is at the beginning of the IndexSegment, since we just - // read it in. - LOG.debug("Found matching index segment."); - while (this.curIndexSegment.next()) { - long curStart = this.curIndexSegment.getCurRecordStart(); - if (curStart >= start) { - LOG.debug("Found seek target record with offset " + curStart); - // This is the first record to meet this criterion. - // Rewind the index iterator by one so that the next() - // method will do the right thing. next() will also - // take care of actually seeking to the correct position - // in the file to read the record proper. - this.curIndexSegment.rewindOnce(); - return; - } - } - - // If it wasn't actually in this IndexSegment, then we've - // got a corrupt IndexTableEntry; the entry represented that - // the segment ran longer than it actually does. - throw new IOException("IndexTableEntry claims last offset of " - + tableEntry.getLastIndexOffset() - + " but IndexSegment ends early." - + " The IndexTable appears corrupt."); - } - } - - // If we didn't return inside the loop, then we've searched the entire - // file and it's not there. Advance the IndexSegment iterator to - // the end of the road so that next() returns false. 
- this.curIndexSegmentId = indexTable.size(); - loadIndexSegment(); - } - - /** - * Read data from the stream and discard it. - * @param numBytes number of bytes to read and discard. - */ - private void consumeBytes(int numBytes) throws IOException { - int remaining = numBytes; - while (remaining > 0) { - int received = dataIn.skipBytes(remaining); - if (received < 1) { - throw new IOException("Could not consume additional bytes"); - } - remaining -= received; - } - } - - /** - * Seek to position 'pos' (offset from start of file). If this - * is nearby, actually just consume data from the underlying - * stream rather than doing a real seek. - * @param targetPos the position to seek to, expressed as an offset - * from the start of the file. - */ - private void internalSeek(long targetPos) throws IOException { - long curPos = this.underlyingInput.getPos(); - LOG.debug("Internal seek: target=" + targetPos + "; cur=" + curPos); - long distance = targetPos - curPos; - if (targetPos == curPos) { - LOG.debug("(no motion required)"); - return; // We're already there! - } else if (targetPos > curPos && distance < MAX_CONSUMPTION_WIDTH) { - // We're "close enough" that we should just read it. - LOG.debug("Advancing by " + distance + " bytes."); - consumeBytes((int) distance); - } else { - LOG.debug("Direct seek to target"); - this.underlyingInput.seek(targetPos); - this.dataIn = new DataInputStream(this.underlyingInput); - } - } - - /** - * Close any stream to an open record that was opened by a user. - */ - private void closeUserStream() throws IOException { - if (this.userInputStream != null) { - this.userInputStream.close(); - this.userInputStream = null; - } - } - - @Override - /** {@inheritDoc} */ - public boolean next() throws IOException { - LOG.debug("Checking for next record"); - checkForNull(this.underlyingInput); - // If the user has opened a record stream, it is now void. - closeUserStream(); - this.isAligned = false; // false until proven true. - - // Get the position of the next record start. - // Check the index: is there another record? - if (null == curIndexSegment) { - LOG.debug("Index is finished; false"); - return false; // No index remains. Ergo, no more records. - } - boolean moreInSegment = curIndexSegment.next(); - if (!moreInSegment) { - // The current IndexSegment has been exhausted. Move to the next. - LOG.debug("Loading next index segment."); - readNextIndexSegment(); - if (null == curIndexSegment) { - LOG.debug("Index is finished; false"); - return false; // No index; no records. - } - - // Try again with the next IndexSegment. - moreInSegment = curIndexSegment.next(); - } - - if (!moreInSegment) { - // Nothing left in the last IndexSegment. - LOG.debug("Last index segment is finished; false."); - this.curIndexSegment = null; - return false; - } - - // Determine where the next record starts. - this.indexRecordLen = this.curIndexSegment.getCurRecordLen(); - this.curRecordOffset = this.curIndexSegment.getCurRecordStart(); - - LOG.debug("Next record starts at position: " + this.curRecordOffset - + "; indexedLen=" + this.indexRecordLen); - - // Make sure we're at the target position. - internalSeek(this.curRecordOffset); - - // We are now on top of the next record's RecordStartMark. - // Consume the RSM and the record header. - this.dataIn.readFully(this.tmpRsmBuf); - if (!matchesRsm(tmpRsmBuf)) { - // No rsm? No dice. 
- throw new IOException("Index contains bogus offset."); - } - - this.curEntryId = WritableUtils.readVLong(this.dataIn); - if (this.curEntryId < 0) { - // We've moved past the end of the records and started - // trying to consume the index. This is the EOF from - // the client's perspective. - LOG.debug("Indexed position is itself an IndexSegment; false."); - return false; - } - LOG.debug("Aligned on record id=" + this.curEntryId); - - this.claimedRecordLen = WritableUtils.readVLong(this.dataIn); - LOG.debug("Record has claimed length " + this.claimedRecordLen); - // We are now aligned on the start of the user's data. - this.isAligned = true; - return true; - } - - @Override - /** {@inheritDoc} */ - public boolean isRecordAvailable() { - return this.isAligned; - } - - @Override - /** {@inheritDoc} */ - public long getRecordLen() { - return this.claimedRecordLen; - } - - @Override - /** {@inheritDoc} */ - public long getRecordId() { - return this.curEntryId; - } - - @Override - /** {@inheritDoc} */ - public long getRecordOffset() { - return this.curRecordOffset; - } - - @Override - /** {@inheritDoc} */ - public InputStream readBlobRecord() throws IOException { - if (!isRecordAvailable()) { - // we're not currently aligned on a record-start. - // Try to get the next one. - if (!next()) { - // No more records available. - throw new EOFException("End of file reached."); - } - } - - // Ensure any previously-open user record stream is closed. - closeUserStream(); - - // Mark this record as consumed. - this.isAligned = false; - - // The length of the stream we can return to the user is - // the indexRecordLen minus the length of any per-record headers. - // That includes the RecordStartMark, the entryId, and the claimedLen. - long streamLen = this.indexRecordLen - RecordStartMark.START_MARK_LENGTH - - WritableUtils.getVIntSize(this.curEntryId) - - WritableUtils.getVIntSize(this.claimedRecordLen); - LOG.debug("Yielding stream to user with length " + streamLen); - this.userInputStream = new FixedLengthInputStream(this.dataIn, streamLen); - if (this.codec != null) { - // The user needs to decompress the data; wrap the InputStream. - decompressor.reset(); - this.userInputStream = new DecompressorStream( - this.userInputStream, decompressor); - } - return this.userInputStream; - } - - @Override - /** {@inheritDoc} */ - public java.io.Reader readClobRecord() throws IOException { - // Get a handle to the binary reader and then wrap it. - InputStream is = readBlobRecord(); - return new InputStreamReader(is); - } - - @Override - /** {@inheritDoc} */ - public void close() throws IOException { - closeUserStream(); - - if (null != dataIn) { - dataIn.close(); - dataIn = null; - } - - if (null != underlyingInput) { - underlyingInput.close(); - underlyingInput = null; - } - - this.isAligned = false; - } - - @Override - /** {@inheritDoc} */ - public boolean isClosed() { - return this.underlyingInput == null; - } + public abstract static class Reader + extends org.apache.sqoop.io.LobFile.Reader { } /** * Creates a LobFile Reader configured to read from the specified file. 
*/ public static Reader open(Path p, Configuration conf) throws IOException { - FileSystem fs = p.getFileSystem(conf); - FileStatus [] stats = fs.listStatus(p); - if (null == stats || stats.length == 0) { - throw new IOException("Could not find file: " + p); - } - FSDataInputStream fis = fs.open(p); - DataInputStream dis = new DataInputStream(fis); - LobFileHeader header = new LobFileHeader(dis); - int version = header.getVersion(); - - if (version == 0) { - return new V0Reader(p, conf, header, dis, fis, stats[0].getLen()); - } else { - throw new IOException("No reader available for LobFile version " - + version); - } + return org.apache.sqoop.io.LobFile.open(p, conf); } /** @@ -1775,7 +87,7 @@ public static Reader open(Path p, Configuration conf) throws IOException { * @param conf the configuration to use to interact with the filesystem. */ public static Writer create(Path p, Configuration conf) throws IOException { - return create(p, conf, false); + return org.apache.sqoop.io.LobFile.create(p, conf, false); } /** @@ -1786,7 +98,7 @@ public static Writer create(Path p, Configuration conf) throws IOException { */ public static Writer create(Path p, Configuration conf, boolean isCharData) throws IOException { - return create(p, conf, isCharData, null); + return org.apache.sqoop.io.LobFile.create(p, conf, isCharData, null); } /** @@ -1798,8 +110,7 @@ public static Writer create(Path p, Configuration conf, boolean isCharData) */ public static Writer create(Path p, Configuration conf, boolean isCharData, String codec) throws IOException { - return create(p, conf, isCharData, codec, - V0Writer.DEFAULT_MAX_SEGMENT_ENTRIES); + return org.apache.sqoop.io.LobFile.create(p, conf, isCharData, codec); } /** @@ -1813,7 +124,8 @@ public static Writer create(Path p, Configuration conf, boolean isCharData, public static Writer create(Path p, Configuration conf, boolean isCharData, String codec, int entriesPerSegment) throws IOException { - return new V0Writer(p, conf, isCharData, codec, entriesPerSegment); + return org.apache.sqoop.io.LobFile.create( + p, conf, isCharData, codec, entriesPerSegment); } } diff --git a/src/java/com/cloudera/sqoop/io/LobReaderCache.java b/src/java/com/cloudera/sqoop/io/LobReaderCache.java index 94892374..3394296d 100644 --- a/src/java/com/cloudera/sqoop/io/LobReaderCache.java +++ b/src/java/com/cloudera/sqoop/io/LobReaderCache.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,19 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - - package com.cloudera.sqoop.io; import java.io.IOException; -import java.util.Map; -import java.util.TreeMap; import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; /** * A cache of open LobFile.Reader objects. @@ -38,17 +30,13 @@ * instances, it is most useful to have a single global cache. This cache is * internally synchronized; only one thread can insert or retrieve a reader * from the cache at a time. + * + * @deprecated use org.apache.sqoop.io.LobReaderCache instead. 
+ * @see org.apache.sqoop.io.LobReaderCache */ -public final class LobReaderCache { +public final class LobReaderCache extends org.apache.sqoop.io.LobReaderCache { - public static final Log LOG = LogFactory.getLog( - LobReaderCache.class.getName()); - - private Map readerMap; - - private LobReaderCache() { - this.readerMap = new TreeMap(); - } + public static final Log LOG = org.apache.sqoop.io.LobReaderCache.LOG; private static final LobReaderCache CACHE; static { @@ -71,79 +59,7 @@ public static LobReaderCache getCache() { */ public static Path qualify(Path path, Configuration conf) throws IOException { - if (null == path) { - return null; - } - - FileSystem fs = path.getFileSystem(conf); - if (null == fs) { - fs = FileSystem.get(conf); - } - return path.makeQualified(fs); - } - - /** - * Open a LobFile for read access, returning a cached reader if one is - * available, or a new reader otherwise. - * @param path the path to the LobFile to open - * @param conf the configuration to use to access the FS. - * @throws IOException if there's an error opening the file. - */ - public LobFile.Reader get(Path path, Configuration conf) - throws IOException { - - LobFile.Reader reader = null; - Path canonicalPath = qualify(path, conf); - // Look up an entry in the cache. - synchronized(this) { - reader = readerMap.remove(canonicalPath); - } - - if (null != reader && !reader.isClosed()) { - // Cache hit. return it. - LOG.debug("Using cached reader for " + canonicalPath); - return reader; - } - - // Cache miss; open the file. - LOG.debug("No cached reader available for " + canonicalPath); - return LobFile.open(path, conf); - } - - /** - * Return a reader back to the cache. If there's already a reader for - * this path, then the current reader is closed. - * @param reader the opened reader. Any record-specific subreaders should be - * closed. - * @throws IOException if there's an error accessing the path's filesystem. - */ - public void recycle(LobFile.Reader reader) throws IOException { - Path canonicalPath = reader.getPath(); - - // Check if the cache has a reader for this path already. If not, add this. - boolean cached = false; - synchronized(this) { - if (readerMap.get(canonicalPath) == null) { - LOG.debug("Caching reader for path: " + canonicalPath); - readerMap.put(canonicalPath, reader); - cached = true; - } - } - - if (!cached) { - LOG.debug("Reader already present for path: " + canonicalPath - + "; closing."); - reader.close(); - } - } - - @Override - protected synchronized void finalize() throws Throwable { - for (LobFile.Reader r : readerMap.values()) { - r.close(); - } - - super.finalize(); + return org.apache.sqoop.io.LobReaderCache.qualify(path, conf); } } diff --git a/src/java/com/cloudera/sqoop/io/NamedFifo.java b/src/java/com/cloudera/sqoop/io/NamedFifo.java index 9a9f6d08..e27d2c4f 100644 --- a/src/java/com/cloudera/sqoop/io/NamedFifo.java +++ b/src/java/com/cloudera/sqoop/io/NamedFifo.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,82 +15,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.io; import java.io.File; -import java.io.IOException; - -import org.apache.hadoop.util.Shell; -import org.apache.log4j.Logger; /** * A named FIFO channel. 
+ * + * @deprecated use org.apache.sqoop.io.NamedFifo instead. + * @see org.apache.sqoop.io.NamedFifo */ -public class NamedFifo { +public class NamedFifo extends org.apache.sqoop.io.NamedFifo { - private static final Logger LOG = Logger.getLogger(NamedFifo.class); - - private File fifoFile; - - /** Create a named FIFO object at the local fs path given by 'pathname'. */ public NamedFifo(String pathname) { - this.fifoFile = new File(pathname); + super(pathname); } - /** Create a named FIFO object at the local fs path given by the 'fifo' File - * object. */ public NamedFifo(File fifo) { - this.fifoFile = fifo; - } - - /** - * Return the File object representing the FIFO. - */ - public File getFile() { - return this.fifoFile; - } - - /** - * Create a named FIFO object. - * The pipe will be created with permissions 0600. - * @throws IOException on failure. - */ - public void create() throws IOException { - create(0600); - } - - /** - * Create a named FIFO object with the specified fs permissions. - * This depends on the 'mknod' or 'mkfifo' (Mac OS X) system utility - * existing. (for example, provided by Linux coreutils). This object - * will be deleted when the process exits. - * @throws IOException on failure. - */ - public void create(int permissions) throws IOException { - String filename = fifoFile.toString(); - - // Format permissions as a mode string in base 8. - String modeStr = Integer.toString(permissions, 8); - - // Create the FIFO itself. - try { - String output = Shell.execCommand("mknod", "--mode=0" + modeStr, - filename, "p"); - LOG.info("mknod output:\n"+output); - } catch (IOException ex) { - LOG.info("IO error running mknod: " + ex.getMessage()); - LOG.debug("IO error running mknod", ex); - } - if (!this.fifoFile.exists()) { - LOG.info("mknod failed, falling back to mkfifo"); - String output = Shell.execCommand("mkfifo", "-m", "0" + modeStr, - filename); - LOG.info("mkfifo output:\n"+output); - } - - // Schedule the FIFO to be cleaned up when we exit. - this.fifoFile.deleteOnExit(); + super(fifo); } } diff --git a/src/java/com/cloudera/sqoop/io/SplittableBufferedWriter.java b/src/java/com/cloudera/sqoop/io/SplittableBufferedWriter.java index 1ff0422b..9bd533df 100644 --- a/src/java/com/cloudera/sqoop/io/SplittableBufferedWriter.java +++ b/src/java/com/cloudera/sqoop/io/SplittableBufferedWriter.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -20,56 +18,27 @@ package com.cloudera.sqoop.io; -import java.io.BufferedWriter; -import java.io.OutputStreamWriter; -import java.io.IOException; +import org.apache.sqoop.io.SplittingOutputStream; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; /** * A BufferedWriter implementation that wraps around a SplittingOutputStream * and allows splitting of the underlying stream. * Splits occur at allowSplit() calls, or newLine() calls. + * + * @deprecated use org.apache.sqoop.io.SplittableBufferedWriter instead. 
+ * @see org.apache.sqoop.io.SplittableBufferedWriter */ -public class SplittableBufferedWriter extends BufferedWriter { - - public static final Log LOG = LogFactory.getLog( - SplittableBufferedWriter.class.getName()); - - private SplittingOutputStream splitOutputStream; - private boolean alwaysFlush; +public class SplittableBufferedWriter + extends org.apache.sqoop.io.SplittableBufferedWriter { public SplittableBufferedWriter( final SplittingOutputStream splitOutputStream) { - super(new OutputStreamWriter(splitOutputStream)); - - this.splitOutputStream = splitOutputStream; - this.alwaysFlush = false; + super(splitOutputStream); } - /** For testing. */ SplittableBufferedWriter(final SplittingOutputStream splitOutputStream, final boolean alwaysFlush) { - super(new OutputStreamWriter(splitOutputStream)); - - this.splitOutputStream = splitOutputStream; - this.alwaysFlush = alwaysFlush; - } - - public void newLine() throws IOException { - super.newLine(); - this.allowSplit(); - } - - public void allowSplit() throws IOException { - if (alwaysFlush) { - this.flush(); - } - if (this.splitOutputStream.wouldSplit()) { - LOG.debug("Starting new split"); - this.flush(); - this.splitOutputStream.allowSplit(); - } + super(splitOutputStream, alwaysFlush); } } diff --git a/src/java/com/cloudera/sqoop/io/SplittingOutputStream.java b/src/java/com/cloudera/sqoop/io/SplittingOutputStream.java index 7ad262d5..ab81042b 100644 --- a/src/java/com/cloudera/sqoop/io/SplittingOutputStream.java +++ b/src/java/com/cloudera/sqoop/io/SplittingOutputStream.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,19 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.io; -import java.io.OutputStream; import java.io.IOException; -import java.util.Formatter; -import org.apache.commons.io.output.CountingOutputStream; import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.compress.CompressionCodec; @@ -37,127 +28,18 @@ * An output stream that writes to an underlying filesystem, opening * a new file after a specified number of bytes have been written to the * current one. + * + * @deprecated use org.apache.sqoop.io.SplittingOutputStream instead. + * @see org.apache.sqoop.io.SplittingOutputStream */ -public class SplittingOutputStream extends OutputStream { +public class SplittingOutputStream + extends org.apache.sqoop.io.SplittingOutputStream { - public static final Log LOG = LogFactory.getLog( - SplittingOutputStream.class.getName()); + public static final Log LOG = org.apache.sqoop.io.SplittingOutputStream.LOG; - private OutputStream writeStream; - private CountingOutputStream countingFilterStream; - private Configuration conf; - private Path destDir; - private String filePrefix; - private long cutoffBytes; - private CompressionCodec codec; - private int fileNum; - - /** - * Create a new SplittingOutputStream. - * @param conf the Configuration to use to interface with HDFS - * @param destDir the directory where the files will go (should already - * exist). - * @param filePrefix the first part of the filename, which will be appended - * by a number. 
This file will be placed inside destDir. - * @param cutoff the approximate number of bytes to use per file - * @param doGzip if true, then output files will be gzipped and have a .gz - * suffix. - */ public SplittingOutputStream(final Configuration conf, final Path destDir, final String filePrefix, final long cutoff, final CompressionCodec codec) throws IOException { - - this.conf = conf; - this.destDir = destDir; - this.filePrefix = filePrefix; - this.cutoffBytes = cutoff; - if (this.cutoffBytes < 0) { - this.cutoffBytes = 0; // splitting disabled. - } - this.codec = codec; - this.fileNum = 0; - - openNextFile(); - } - - /** Initialize the OutputStream to the next file to write to. - */ - private void openNextFile() throws IOException { - FileSystem fs = FileSystem.get(conf); - - StringBuffer sb = new StringBuffer(); - Formatter fmt = new Formatter(sb); - fmt.format("%05d", this.fileNum++); - String filename = filePrefix + fmt.toString(); - if (codec != null) { - filename = filename + codec.getDefaultExtension(); - } - Path destFile = new Path(destDir, filename); - LOG.debug("Opening next output file: " + destFile); - if (fs.exists(destFile)) { - Path canonicalDest = destFile.makeQualified(fs); - throw new IOException("Destination file " + canonicalDest - + " already exists"); - } - - OutputStream fsOut = fs.create(destFile); - - // Count how many actual bytes hit HDFS. - this.countingFilterStream = new CountingOutputStream(fsOut); - - if (codec != null) { - // Wrap that in a compressing stream. - this.writeStream = codec.createOutputStream(this.countingFilterStream); - } else { - // Write to the counting stream directly. - this.writeStream = this.countingFilterStream; - } - } - - /** - * @return true if allowSplit() would actually cause a split. - */ - public boolean wouldSplit() { - return this.cutoffBytes > 0 - && this.countingFilterStream.getByteCount() >= this.cutoffBytes; - } - - /** If we've written more to the disk than the user's split size, - * open the next file. - */ - private void checkForNextFile() throws IOException { - if (wouldSplit()) { - LOG.debug("Starting new split"); - this.writeStream.flush(); - this.writeStream.close(); - openNextFile(); - } - } - - /** Defines a point in the stream when it is acceptable to split to a new - file; e.g., the end of a record. - */ - public void allowSplit() throws IOException { - checkForNextFile(); - } - - public void close() throws IOException { - this.writeStream.close(); - } - - public void flush() throws IOException { - this.writeStream.flush(); - } - - public void write(byte [] b) throws IOException { - this.writeStream.write(b); - } - - public void write(byte [] b, int off, int len) throws IOException { - this.writeStream.write(b, off, len); - } - - public void write(int b) throws IOException { - this.writeStream.write(b); + super(conf, destDir, filePrefix, cutoff, codec); } } diff --git a/src/java/com/cloudera/sqoop/io/UnsupportedCodecException.java b/src/java/com/cloudera/sqoop/io/UnsupportedCodecException.java index b9ee728e..4d8225a3 100644 --- a/src/java/com/cloudera/sqoop/io/UnsupportedCodecException.java +++ b/src/java/com/cloudera/sqoop/io/UnsupportedCodecException.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information @@ -17,15 +15,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.io; -import java.io.IOException; /** * Thrown when a compression codec cannot be recognized. + * + * @deprecated use org.apache.sqoop.io.UnsupportedCodecException instead. + * @see org.apache.sqoop.io.UnsupportedCodecException */ -public class UnsupportedCodecException extends IOException { +public class UnsupportedCodecException + extends org.apache.sqoop.io.UnsupportedCodecException { + public UnsupportedCodecException() { super("UnsupportedCodecException"); } diff --git a/src/java/com/cloudera/sqoop/lib/BigDecimalSerializer.java b/src/java/com/cloudera/sqoop/lib/BigDecimalSerializer.java index a772608b..2ae89c24 100644 --- a/src/java/com/cloudera/sqoop/lib/BigDecimalSerializer.java +++ b/src/java/com/cloudera/sqoop/lib/BigDecimalSerializer.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,7 +15,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.lib; import java.io.DataInput; @@ -26,8 +23,6 @@ import java.math.BigDecimal; import java.math.BigInteger; -import org.apache.hadoop.io.Text; - /** * Serialize BigDecimal classes to/from DataInput and DataOutput objects. * @@ -43,43 +38,24 @@ * [int: scale][boolean: b == true][string: BigInt-part.toString()] * * TODO(aaron): Get this to work with Hadoop's Serializations framework. + * + * @deprecated use org.apache.sqoop.lib.BigDecimalSerializer instead. 
+ * @see org.apache.sqoop.lib.BigDecimalSerializer */ public final class BigDecimalSerializer { private BigDecimalSerializer() { } static final BigInteger LONG_MAX_AS_BIGINT = - BigInteger.valueOf(Long.MAX_VALUE); + org.apache.sqoop.lib.BigDecimalSerializer.LONG_MAX_AS_BIGINT; static final BigInteger LONG_MIN_AS_BIGINT = - BigInteger.valueOf(Long.MIN_VALUE); + org.apache.sqoop.lib.BigDecimalSerializer.LONG_MIN_AS_BIGINT; public static void write(BigDecimal d, DataOutput out) throws IOException { - int scale = d.scale(); - BigInteger bigIntPart = d.unscaledValue(); - boolean fastpath = bigIntPart.compareTo(LONG_MAX_AS_BIGINT) < 0 - && bigIntPart .compareTo(LONG_MIN_AS_BIGINT) > 0; - - out.writeInt(scale); - out.writeBoolean(fastpath); - if (fastpath) { - out.writeLong(bigIntPart.longValue()); - } else { - Text.writeString(out, bigIntPart.toString()); - } + org.apache.sqoop.lib.BigDecimalSerializer.write(d, out); } public static BigDecimal readFields(DataInput in) throws IOException { - int scale = in.readInt(); - boolean fastpath = in.readBoolean(); - BigInteger unscaledIntPart; - if (fastpath) { - long unscaledValue = in.readLong(); - unscaledIntPart = BigInteger.valueOf(unscaledValue); - } else { - String unscaledValueStr = Text.readString(in); - unscaledIntPart = new BigInteger(unscaledValueStr); - } - - return new BigDecimal(unscaledIntPart, scale); + return org.apache.sqoop.lib.BigDecimalSerializer.readFields(in); } } diff --git a/src/java/com/cloudera/sqoop/lib/BlobRef.java b/src/java/com/cloudera/sqoop/lib/BlobRef.java index 76b79478..b3d5341e 100644 --- a/src/java/com/cloudera/sqoop/lib/BlobRef.java +++ b/src/java/com/cloudera/sqoop/lib/BlobRef.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,37 +15,27 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.lib; -import java.io.ByteArrayInputStream; -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.io.InputStream; -import java.util.Arrays; -import java.util.regex.Matcher; - -import org.apache.hadoop.io.BytesWritable; -import com.cloudera.sqoop.io.LobFile; - import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; /** * BlobRef is a wrapper that holds a BLOB either directly, or a * reference to a file that holds the BLOB data. + * + * @deprecated use org.apache.sqoop.lib.BlobRef instead. 
+ * @see org.apache.sqoop.lib.BlobRef */ -public class BlobRef extends LobRef { +public class BlobRef extends org.apache.sqoop.lib.BlobRef { - public static final Log LOG = LogFactory.getLog(BlobRef.class.getName()); + public static final Log LOG = org.apache.sqoop.lib.BlobRef.LOG; public BlobRef() { super(); } public BlobRef(byte [] bytes) { - super(new BytesWritable(bytes)); + super(bytes); } /** @@ -60,45 +48,6 @@ public BlobRef(String file, long offset, long length) { super(file, offset, length); } - @Override - protected InputStream getExternalSource(LobFile.Reader reader) - throws IOException { - return reader.readBlobRecord(); - } - - @Override - protected InputStream getInternalSource(BytesWritable data) { - return new ByteArrayInputStream(data.getBytes(), 0, data.getLength()); - } - - @Override - protected byte [] getInternalData(BytesWritable data) { - return Arrays.copyOf(data.getBytes(), data.getLength()); - } - - @Override - protected BytesWritable deepCopyData(BytesWritable data) { - return new BytesWritable(Arrays.copyOf(data.getBytes(), data.getLength())); - } - - @Override - public void readFieldsInternal(DataInput in) throws IOException { - // For internally-stored BLOBs, the data is a BytesWritable - // containing the actual data. - - BytesWritable data = getDataObj(); - - if (null == data) { - data = new BytesWritable(); - } - data.readFields(in); - setDataObj(data); - } - - @Override - public void writeInternal(DataOutput out) throws IOException { - getDataObj().write(out); - } /** * Create a BlobRef based on parsed data from a line of text. @@ -110,24 +59,7 @@ public void writeInternal(DataOutput out) throws IOException { * an empty BlobRef if the data to be parsed is actually inline. */ public static BlobRef parse(String inputString) { - // If inputString is of the form 'externalLob(lf,%s,%d,%d)', then this is - // an external BLOB stored at the LobFile indicated by '%s' with the next - // two arguments representing its offset and length in the file. - // Otherwise, it is an inline BLOB, which we don't support parsing of. - - Matcher m = EXTERNAL_MATCHER.get(); - m.reset(inputString); - if (m.matches()) { - // This is a LobFile. Extract the filename, offset and len from the - // matcher. - return new BlobRef(m.group(1), Long.valueOf(m.group(2)), - Long.valueOf(m.group(3))); - } else { - // This is inline BLOB string data. - LOG.warn( - "Reparsing inline BLOB data is not supported; use SequenceFiles."); - return new BlobRef(); - } + return org.apache.sqoop.lib.BlobRef.parse(inputString); } } diff --git a/src/java/com/cloudera/sqoop/lib/BooleanParser.java b/src/java/com/cloudera/sqoop/lib/BooleanParser.java index bb34e1f6..ab97cf0a 100644 --- a/src/java/com/cloudera/sqoop/lib/BooleanParser.java +++ b/src/java/com/cloudera/sqoop/lib/BooleanParser.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -23,6 +21,8 @@ /** * Parse string representations of boolean values into boolean * scalar types. + * @deprecated use org.apache.sqoop.lib.BooleanParser instead. + * @see org.apache.sqoop.lib.BooleanParser */ public final class BooleanParser { private BooleanParser() { @@ -37,9 +37,7 @@ private BooleanParser() { *
<p>All comparisons are case-insensitive.</p>
*/ public static boolean valueOf(final String s) { - return s != null && ("true".equalsIgnoreCase(s) || "t".equalsIgnoreCase(s) - || "1".equals(s) || "on".equalsIgnoreCase(s) - || "yes".equalsIgnoreCase(s)); + return org.apache.sqoop.lib.BooleanParser.valueOf(s); } } diff --git a/src/java/com/cloudera/sqoop/lib/ClobRef.java b/src/java/com/cloudera/sqoop/lib/ClobRef.java index 1fd20147..a328f23b 100644 --- a/src/java/com/cloudera/sqoop/lib/ClobRef.java +++ b/src/java/com/cloudera/sqoop/lib/ClobRef.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -20,21 +18,14 @@ package com.cloudera.sqoop.lib; -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.util.regex.Matcher; - -import org.apache.hadoop.io.Text; -import com.cloudera.sqoop.io.LobFile; - /** * ClobRef is a wrapper that holds a CLOB either directly, or a * reference to a file that holds the CLOB data. + * + * @deprecated use org.apache.sqoop.lib.ClobRef instead. + * @see org.apache.sqoop.lib.ClobRef */ -public class ClobRef extends LobRef { +public class ClobRef extends org.apache.sqoop.lib.ClobRef { public ClobRef() { super(); @@ -54,60 +45,13 @@ public ClobRef(String file, long offset, long length) { super(file, offset, length); } - @Override - protected Reader getExternalSource(LobFile.Reader reader) - throws IOException { - return reader.readClobRecord(); - } - - @Override - protected Reader getInternalSource(String data) { - return new StringReader(data); - } - - @Override - protected String deepCopyData(String data) { - return data; - } - - @Override - protected String getInternalData(String data) { - return data; - } - - @Override - public void readFieldsInternal(DataInput in) throws IOException { - // For internally-stored clobs, the data is written as UTF8 Text. - setDataObj(Text.readString(in)); - } - - @Override - public void writeInternal(DataOutput out) throws IOException { - Text.writeString(out, getDataObj()); - } - /** * Create a ClobRef based on parsed data from a line of text. * @param inputString the text-based input data to parse. * @return a ClobRef to the given data. */ public static ClobRef parse(String inputString) { - // If inputString is of the form 'externalLob(lf,%s,%d,%d)', then this is - // an external CLOB stored at the LobFile indicated by '%s' with the next - // two arguments representing its offset and length in the file. - // Otherwise, it is an inline CLOB, which we read as-is. - - Matcher m = EXTERNAL_MATCHER.get(); - m.reset(inputString); - if (m.matches()) { - // This is a LobFile. Extract the filename, offset and len from the - // matcher. - return new ClobRef(m.group(1), Long.valueOf(m.group(2)), - Long.valueOf(m.group(3))); - } else { - // This is inline CLOB string data. 
- return new ClobRef(inputString); - } + return org.apache.sqoop.lib.ClobRef.parse(inputString); } } diff --git a/src/java/com/cloudera/sqoop/lib/DelimiterSet.java b/src/java/com/cloudera/sqoop/lib/DelimiterSet.java index 25dbcfdc..6de90ad9 100644 --- a/src/java/com/cloudera/sqoop/lib/DelimiterSet.java +++ b/src/java/com/cloudera/sqoop/lib/DelimiterSet.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,33 +15,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.lib; /** * Encapsulates a set of delimiters used to encode a record. + * @deprecated use org.apache.sqoop.lib.DelimiterSet instead. + * @see org.apache.sqoop.lib.DelimiterSet */ -public class DelimiterSet implements Cloneable { +public class DelimiterSet extends org.apache.sqoop.lib.DelimiterSet { - public static final char NULL_CHAR = '\000'; - - private char fieldDelim; // fields terminated by this. - private char recordDelim; // records terminated by this. - - // If these next two fields are '\000', then they are ignored. - private char enclosedBy; - private char escapedBy; - - // If true, then the enclosed-by character is applied to every - // field, not just ones containing embedded delimiters. - private boolean encloseRequired; + public static final char NULL_CHAR = + org.apache.sqoop.lib.DelimiterSet.NULL_CHAR; /** * Create a delimiter set with the default delimiters * (comma for fields, newline for records). */ public DelimiterSet() { - this(',', '\n', NULL_CHAR, NULL_CHAR, false); + super(); } /** @@ -57,152 +46,7 @@ public DelimiterSet() { */ public DelimiterSet(char field, char record, char enclose, char escape, boolean isEncloseRequired) { - this.fieldDelim = field; - this.recordDelim = record; - this.enclosedBy = enclose; - this.escapedBy = escape; - this.encloseRequired = isEncloseRequired; - } - - /** - * Sets the fields-terminated-by character. - */ - public void setFieldsTerminatedBy(char f) { - this.fieldDelim = f; - } - - /** - * @return the fields-terminated-by character. - */ - public char getFieldsTerminatedBy() { - return this.fieldDelim; - } - - /** - * Sets the end-of-record lines-terminated-by character. - */ - public void setLinesTerminatedBy(char r) { - this.recordDelim = r; - } - - /** - * @return the end-of-record (lines-terminated-by) character. - */ - public char getLinesTerminatedBy() { - return this.recordDelim; - } - - /** - * Sets the enclosed-by character. - * @param e the enclosed-by character, or '\000' for no enclosing character. - */ - public void setEnclosedBy(char e) { - this.enclosedBy = e; - } - - /** - * @return the enclosed-by character, or '\000' for none. - */ - public char getEnclosedBy() { - return this.enclosedBy; - } - - /** - * Sets the escaped-by character. - * @param e the escaped-by character, or '\000' for no escape character. - */ - public void setEscapedBy(char e) { - this.escapedBy = e; - } - - /** - * @return the escaped-by character, or '\000' for none. - */ - public char getEscapedBy() { - return this.escapedBy; - } - - /** - * Set whether the enclosed-by character must be applied to all fields, - * or only fields with embedded delimiters. 
- */ - public void setEncloseRequired(boolean required) { - this.encloseRequired = required; - } - - /** - * @return true if the enclosed-by character must be applied to all fields, - * or false if it's only used for fields with embedded delimiters. - */ - public boolean isEncloseRequired() { - return this.encloseRequired; - } - - @Override - /** - * @return a string representation of the delimiters. - */ - public String toString() { - return "fields=" + this.fieldDelim - + " records=" + this.recordDelim - + " escape=" + this.escapedBy - + " enclose=" + this.enclosedBy - + " required=" + this.encloseRequired; - } - - /** - * Format this set of delimiters as a call to the constructor for - * this object, that would generate identical delimiters. - * @return a String that can be embedded in generated code that - * provides this set of delimiters. - */ - public String formatConstructor() { - return "new DelimiterSet((char) " + (int) this.fieldDelim + ", " - + "(char) " + (int) this.recordDelim + ", " - + "(char) " + (int) this.enclosedBy + ", " - + "(char) " + (int) this.escapedBy + ", " - + this.encloseRequired + ")"; - } - - @Override - /** - * @return a hash code for this set of delimiters. - */ - public int hashCode() { - return (int) this.fieldDelim - + (((int) this.recordDelim) << 4) - + (((int) this.escapedBy) << 8) - + (((int) this.enclosedBy) << 12) - + (((int) this.recordDelim) << 16) - + (this.encloseRequired ? 0xFEFE : 0x7070); - } - - @Override - /** - * @return true if this delimiter set is the same as another set of - * delimiters. - */ - public boolean equals(Object other) { - if (null == other) { - return false; - } else if (!other.getClass().equals(getClass())) { - return false; - } - - DelimiterSet set = (DelimiterSet) other; - return this.fieldDelim == set.fieldDelim - && this.recordDelim == set.recordDelim - && this.escapedBy == set.escapedBy - && this.enclosedBy == set.enclosedBy - && this.encloseRequired == set.encloseRequired; - } - - @Override - /** - * @return a new copy of this same set of delimiters. - */ - public Object clone() throws CloneNotSupportedException { - return super.clone(); + super(field, record, enclose, escape, isEncloseRequired); } /** diff --git a/src/java/com/cloudera/sqoop/lib/FieldFormatter.java b/src/java/com/cloudera/sqoop/lib/FieldFormatter.java index d19c1426..45fb81f4 100644 --- a/src/java/com/cloudera/sqoop/lib/FieldFormatter.java +++ b/src/java/com/cloudera/sqoop/lib/FieldFormatter.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,11 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.lib; /** * Static helper class that will help format data with quotes and escape chars. + * + * @deprecated use org.apache.sqoop.lib.FieldFormatter instead. 
+ * @see org.apache.sqoop.lib.FieldFormatter */ public final class FieldFormatter { @@ -35,7 +35,8 @@ private FieldFormatter() { } */ public static String hiveStringDropDelims(String str, DelimiterSet delimiters) { - return hiveStringReplaceDelims(str, "", delimiters); + return org.apache.sqoop.lib.FieldFormatter.hiveStringDropDelims( + str, delimiters); } /** @@ -47,8 +48,8 @@ public static String hiveStringDropDelims(String str, */ public static String hiveStringReplaceDelims(String str, String replacement, DelimiterSet delimiters) { - String droppedDelims = str.replaceAll("\\n|\\r|\01", replacement); - return escapeAndEnclose(droppedDelims, delimiters); + return org.apache.sqoop.lib.FieldFormatter.hiveStringReplaceDelims( + str, replacement, delimiters); } /** @@ -73,68 +74,7 @@ public static String hiveStringReplaceDelims(String str, String replacement, * @return the escaped, enclosed version of 'str'. */ public static String escapeAndEnclose(String str, DelimiterSet delimiters) { - - char escape = delimiters.getEscapedBy(); - char enclose = delimiters.getEnclosedBy(); - boolean encloseRequired = delimiters.isEncloseRequired(); - - // true if we can use an escape character. - boolean escapingLegal = DelimiterSet.NULL_CHAR != escape; - String withEscapes; - - if (null == str) { - return null; - } - - if (escapingLegal) { - // escaping is legal. Escape any instances of the escape char itself. - withEscapes = str.replace("" + escape, "" + escape + escape); - } else { - // no need to double-escape - withEscapes = str; - } - - if (DelimiterSet.NULL_CHAR == enclose) { - // The enclose-with character was left unset, so we can't enclose items. - - if (escapingLegal) { - // If the user has used the fields-terminated-by or - // lines-terminated-by characters in the string, escape them if we - // have an escape character. - String fields = "" + delimiters.getFieldsTerminatedBy(); - String lines = "" + delimiters.getLinesTerminatedBy(); - withEscapes = withEscapes.replace(fields, "" + escape + fields); - withEscapes = withEscapes.replace(lines, "" + escape + lines); - } - - // No enclosing possible, so now return this. - return withEscapes; - } - - // if we have an enclosing character, and escaping is legal, then the - // encloser must always be escaped. - if (escapingLegal) { - withEscapes = withEscapes.replace("" + enclose, "" + escape + enclose); - } - - boolean actuallyDoEnclose = encloseRequired; - if (!actuallyDoEnclose) { - // check if the string requires enclosing. - char [] mustEncloseFor = new char[2]; - mustEncloseFor[0] = delimiters.getFieldsTerminatedBy(); - mustEncloseFor[1] = delimiters.getLinesTerminatedBy(); - for (char reason : mustEncloseFor) { - if (str.indexOf(reason) != -1) { - actuallyDoEnclose = true; - break; - } - } - } - - if (actuallyDoEnclose) { - return "" + enclose + withEscapes + enclose; - } else { - return withEscapes; - } + return org.apache.sqoop.lib.FieldFormatter.escapeAndEnclose( + str, delimiters); } } diff --git a/src/java/com/cloudera/sqoop/lib/FieldMapProcessor.java b/src/java/com/cloudera/sqoop/lib/FieldMapProcessor.java index 182c90ba..3f215401 100644 --- a/src/java/com/cloudera/sqoop/lib/FieldMapProcessor.java +++ b/src/java/com/cloudera/sqoop/lib/FieldMapProcessor.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information @@ -17,24 +15,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.lib; -import java.io.IOException; - /** * Interface implemented by classes that process FieldMappable objects. + * + * @deprecated use org.apache.sqoop.lib.FieldMapProcessor instead. + * @see org.apache.sqoop.lib.FieldMapProcessor */ -public interface FieldMapProcessor { - - /** - * Allow arbitrary processing of a FieldMappable object. - * @param record an object which can emit a map of its field names to values. - * @throws IOException if the processor encounters an IO error when - * operating on this object. - * @throws ProcessingException if the FieldMapProcessor encounters - * a general processing error when operating on this object. - */ - void accept(FieldMappable record) throws IOException, ProcessingException; +public interface FieldMapProcessor + extends org.apache.sqoop.lib.FieldMapProcessor { } diff --git a/src/java/com/cloudera/sqoop/lib/FieldMappable.java b/src/java/com/cloudera/sqoop/lib/FieldMappable.java index 762e402e..2067ecc8 100644 --- a/src/java/com/cloudera/sqoop/lib/FieldMappable.java +++ b/src/java/com/cloudera/sqoop/lib/FieldMappable.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,22 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.lib; -import java.util.Map; - /** * Interface describing a class capable of returning a map of the fields * of the object to their values. + * + * @deprecated use org.apache.sqoop.lib.FieldMappable instead. + * @see org.apache.sqoop.lib.FieldMappable */ -public interface FieldMappable { +public interface FieldMappable extends org.apache.sqoop.lib.FieldMappable { - /** - * Returns a map containing all fields of this record. - * @return a map from column names to the object-based values for - * this record. The map may not be null, though it may be empty. - */ - Map getFieldMap(); } diff --git a/src/java/com/cloudera/sqoop/lib/JdbcWritableBridge.java b/src/java/com/cloudera/sqoop/lib/JdbcWritableBridge.java index 0f75868a..316547f1 100644 --- a/src/java/com/cloudera/sqoop/lib/JdbcWritableBridge.java +++ b/src/java/com/cloudera/sqoop/lib/JdbcWritableBridge.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,10 +15,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.lib; -import org.apache.hadoop.io.BytesWritable; import java.math.BigDecimal; import java.sql.Date; import java.sql.PreparedStatement; @@ -29,228 +25,163 @@ import java.sql.Time; import java.sql.Timestamp; +import org.apache.hadoop.io.BytesWritable; + /** * Contains a set of methods which can read db columns from a ResultSet into * Java types, and do serialization of these types to/from DataInput/DataOutput * for use with Hadoop's Writable implementation. This supports null values * for all types. 
+ * + * @deprecated use org.apache.sqoop.lib.JdbcWritableBridge instead. + * @see org.apache.sqoop.lib.JdbcWritableBridge */ public final class JdbcWritableBridge { // Currently, cap BLOB/CLOB objects at 16 MB until we can use external // storage. - public static final long MAX_BLOB_LENGTH = 16 * 1024 * 1024; - public static final long MAX_CLOB_LENGTH = 16 * 1024 * 1024; + public static final long MAX_BLOB_LENGTH = + org.apache.sqoop.lib.JdbcWritableBridge.MAX_BLOB_LENGTH; + public static final long MAX_CLOB_LENGTH = + org.apache.sqoop.lib.JdbcWritableBridge.MAX_CLOB_LENGTH; private JdbcWritableBridge() { } public static Integer readInteger(int colNum, ResultSet r) throws SQLException { - int val; - val = r.getInt(colNum); - if (r.wasNull()) { - return null; - } else { - return Integer.valueOf(val); - } + return org.apache.sqoop.lib.JdbcWritableBridge.readInteger(colNum, r); } public static Long readLong(int colNum, ResultSet r) throws SQLException { - long val; - val = r.getLong(colNum); - if (r.wasNull()) { - return null; - } else { - return Long.valueOf(val); - } + return org.apache.sqoop.lib.JdbcWritableBridge.readLong(colNum, r); } public static String readString(int colNum, ResultSet r) throws SQLException { - return r.getString(colNum); + return org.apache.sqoop.lib.JdbcWritableBridge.readString(colNum, r); } public static Float readFloat(int colNum, ResultSet r) throws SQLException { - float val; - val = r.getFloat(colNum); - if (r.wasNull()) { - return null; - } else { - return Float.valueOf(val); - } + return org.apache.sqoop.lib.JdbcWritableBridge.readFloat(colNum, r); } public static Double readDouble(int colNum, ResultSet r) throws SQLException { - double val; - val = r.getDouble(colNum); - if (r.wasNull()) { - return null; - } else { - return Double.valueOf(val); - } + return org.apache.sqoop.lib.JdbcWritableBridge.readDouble(colNum, r); } public static Boolean readBoolean(int colNum, ResultSet r) throws SQLException { - boolean val; - val = r.getBoolean(colNum); - if (r.wasNull()) { - return null; - } else { - return Boolean.valueOf(val); - } + return org.apache.sqoop.lib.JdbcWritableBridge.readBoolean(colNum, r); } public static Time readTime(int colNum, ResultSet r) throws SQLException { - return r.getTime(colNum); + return org.apache.sqoop.lib.JdbcWritableBridge.readTime(colNum, r); } public static Timestamp readTimestamp(int colNum, ResultSet r) throws SQLException { - return r.getTimestamp(colNum); + return org.apache.sqoop.lib.JdbcWritableBridge.readTimestamp(colNum, r); } public static Date readDate(int colNum, ResultSet r) throws SQLException { - return r.getDate(colNum); + return org.apache.sqoop.lib.JdbcWritableBridge.readDate(colNum, r); } public static BytesWritable readBytesWritable(int colNum, ResultSet r) throws SQLException { - byte [] bytes = r.getBytes(colNum); - return bytes == null ? null : new BytesWritable(bytes); + return org.apache.sqoop.lib.JdbcWritableBridge.readBytesWritable(colNum, r); } public static BigDecimal readBigDecimal(int colNum, ResultSet r) throws SQLException { - return r.getBigDecimal(colNum); + return org.apache.sqoop.lib.JdbcWritableBridge.readBigDecimal(colNum, r); } public static BlobRef readBlobRef(int colNum, ResultSet r) throws SQLException { - // Loading of BLOBs is delayed; handled by LargeObjectLoader. 
- return null; + return org.apache.sqoop.lib.JdbcWritableBridge.readBlobRef(colNum, r); } public static ClobRef readClobRef(int colNum, ResultSet r) throws SQLException { - // Loading of CLOBs is delayed; handled by LargeObjectLoader. - return null; + return org.apache.sqoop.lib.JdbcWritableBridge.readClobRef(colNum, r); } public static void writeInteger(Integer val, int paramIdx, int sqlType, PreparedStatement s) throws SQLException { - if (null == val) { - s.setNull(paramIdx, sqlType); - } else { - s.setInt(paramIdx, val); - } + org.apache.sqoop.lib.JdbcWritableBridge.writeInteger( + val, paramIdx, sqlType, s); } public static void writeLong(Long val, int paramIdx, int sqlType, PreparedStatement s) throws SQLException { - if (null == val) { - s.setNull(paramIdx, sqlType); - } else { - s.setLong(paramIdx, val); - } + org.apache.sqoop.lib.JdbcWritableBridge.writeLong( + val, paramIdx, sqlType, s); } public static void writeDouble(Double val, int paramIdx, int sqlType, PreparedStatement s) throws SQLException { - if (null == val) { - s.setNull(paramIdx, sqlType); - } else { - s.setDouble(paramIdx, val); - } + org.apache.sqoop.lib.JdbcWritableBridge.writeDouble( + val, paramIdx, sqlType, s); } public static void writeBoolean(Boolean val, int paramIdx, int sqlType, PreparedStatement s) throws SQLException { - if (null == val) { - s.setNull(paramIdx, sqlType); - } else { - s.setBoolean(paramIdx, val); - } + org.apache.sqoop.lib.JdbcWritableBridge.writeBoolean( + val, paramIdx, sqlType, s); } public static void writeFloat(Float val, int paramIdx, int sqlType, PreparedStatement s) throws SQLException { - if (null == val) { - s.setNull(paramIdx, sqlType); - } else { - s.setFloat(paramIdx, val); - } + org.apache.sqoop.lib.JdbcWritableBridge.writeFloat( + val, paramIdx, sqlType, s); } public static void writeString(String val, int paramIdx, int sqlType, PreparedStatement s) throws SQLException { - if (null == val) { - s.setNull(paramIdx, sqlType); - } else { - s.setString(paramIdx, val); - } + org.apache.sqoop.lib.JdbcWritableBridge.writeString( + val, paramIdx, sqlType, s); } public static void writeTimestamp(Timestamp val, int paramIdx, int sqlType, PreparedStatement s) throws SQLException { - if (null == val) { - s.setNull(paramIdx, sqlType); - } else { - s.setTimestamp(paramIdx, val); - } + org.apache.sqoop.lib.JdbcWritableBridge.writeTimestamp( + val, paramIdx, sqlType, s); } public static void writeTime(Time val, int paramIdx, int sqlType, PreparedStatement s) throws SQLException { - if (null == val) { - s.setNull(paramIdx, sqlType); - } else { - s.setTime(paramIdx, val); - } + org.apache.sqoop.lib.JdbcWritableBridge.writeTime( + val, paramIdx, sqlType, s); } public static void writeDate(Date val, int paramIdx, int sqlType, PreparedStatement s) throws SQLException { - if (null == val) { - s.setNull(paramIdx, sqlType); - } else { - s.setDate(paramIdx, val); - } + org.apache.sqoop.lib.JdbcWritableBridge.writeDate( + val, paramIdx, sqlType, s); } public static void writeBytesWritable(BytesWritable val, int paramIdx, int sqlType, PreparedStatement s) throws SQLException { - if (null == val) { - s.setNull(paramIdx, sqlType); - } else { - // val.getBytes() is only valid in [0, len) - byte [] rawBytes = val.getBytes(); - int len = val.getLength(); - byte [] outBytes = new byte[len]; - System.arraycopy(rawBytes, 0, outBytes, 0, len); - s.setBytes(paramIdx, outBytes); - } + org.apache.sqoop.lib.JdbcWritableBridge.writeBytesWritable( + val, paramIdx, sqlType, s); } - public static void 
writeBigDecimal(BigDecimal val, int paramIdx, int sqlType, PreparedStatement s) throws SQLException { - if (null == val) { - s.setNull(paramIdx, sqlType); - } else { - s.setBigDecimal(paramIdx, val); - } + org.apache.sqoop.lib.JdbcWritableBridge.writeBigDecimal( + val, paramIdx, sqlType, s); } public static void writeBlobRef(BlobRef val, int paramIdx, int sqlType, PreparedStatement s) throws SQLException { - // TODO: support this. - throw new RuntimeException("Unsupported: Cannot export BLOB data"); + org.apache.sqoop.lib.JdbcWritableBridge.writeBlobRef( + val, paramIdx, sqlType, s); } public static void writeClobRef(ClobRef val, int paramIdx, int sqlType, PreparedStatement s) throws SQLException { - // TODO: support this. - throw new RuntimeException("Unsupported: Cannot export CLOB data"); + org.apache.sqoop.lib.JdbcWritableBridge.writeClobRef( + val, paramIdx, sqlType, s); } } diff --git a/src/java/com/cloudera/sqoop/lib/LargeObjectLoader.java b/src/java/com/cloudera/sqoop/lib/LargeObjectLoader.java index a189f0b4..43df7fc4 100644 --- a/src/java/com/cloudera/sqoop/lib/LargeObjectLoader.java +++ b/src/java/com/cloudera/sqoop/lib/LargeObjectLoader.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,26 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.lib; -import java.io.Closeable; -import java.io.File; import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.Reader; -import java.io.Writer; -import java.sql.Blob; -import java.sql.Clob; -import java.sql.ResultSet; -import java.sql.SQLException; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import com.cloudera.sqoop.io.LobFile; -import com.cloudera.sqoop.util.TaskId; /** * Contains a set of methods which can read db columns from a ResultSet into @@ -47,26 +31,18 @@ * This is a singleton instance class; only one may exist at a time. * However, its lifetime is limited to the current TaskInputOutputContext's * life. + * + * @deprecated use org.apache.sqoop.lib.LargeObjectLoader instead. + * @see org.apache.sqoop.lib.LargeObjectLoader */ -public class LargeObjectLoader implements Closeable { +public class LargeObjectLoader extends org.apache.sqoop.lib.LargeObjectLoader { // Spill to external storage for BLOB/CLOB objects > 16 MB. - public static final long DEFAULT_MAX_LOB_LENGTH = 16 * 1024 * 1024; + public static final long DEFAULT_MAX_LOB_LENGTH = + org.apache.sqoop.lib.LargeObjectLoader.DEFAULT_MAX_LOB_LENGTH; public static final String MAX_INLINE_LOB_LEN_KEY = - "sqoop.inline.lob.length.max"; - - private Configuration conf; - private Path workPath; - private FileSystem fs; - - // Handles to the open BLOB / CLOB file writers. - private LobFile.Writer curBlobWriter; - private LobFile.Writer curClobWriter; - - // Counter that is used with the current task attempt id to - // generate unique LOB file names. - private long nextLobFileId = 0; + org.apache.sqoop.lib.LargeObjectLoader.MAX_INLINE_LOB_LEN_KEY; /** * Create a new LargeObjectLoader. 
@@ -75,246 +51,6 @@ public class LargeObjectLoader implements Closeable { */ public LargeObjectLoader(Configuration conf, Path workPath) throws IOException { - this.conf = conf; - this.workPath = workPath; - this.fs = FileSystem.get(conf); - this.curBlobWriter = null; - this.curClobWriter = null; - } - - @Override - protected synchronized void finalize() throws Throwable { - close(); - super.finalize(); - } - - @Override - public void close() throws IOException { - if (null != curBlobWriter) { - curBlobWriter.close(); - curBlobWriter = null; - } - - if (null != curClobWriter) { - curClobWriter.close(); - curClobWriter = null; - } - } - - /** - * @return a filename to use to put an external LOB in. - */ - private String getNextLobFileName() { - String file = "_lob/large_obj_" + TaskId.get(conf, "unknown_task_id") - + nextLobFileId + ".lob"; - nextLobFileId++; - - return file; - } - - /** - * Calculates a path to a new LobFile object, creating any - * missing directories. - * @return a Path to a LobFile to write - */ - private Path getNextLobFilePath() throws IOException { - Path p = new Path(workPath, getNextLobFileName()); - Path parent = p.getParent(); - if (!fs.exists(parent)) { - fs.mkdirs(parent); - } - - return p; - } - - /** - * @return the current LobFile writer for BLOBs, creating one if necessary. - */ - private LobFile.Writer getBlobWriter() throws IOException { - if (null == this.curBlobWriter) { - this.curBlobWriter = LobFile.create(getNextLobFilePath(), conf, false); - } - - return this.curBlobWriter; - } - - /** - * @return the current LobFile writer for CLOBs, creating one if necessary. - */ - private LobFile.Writer getClobWriter() throws IOException { - if (null == this.curClobWriter) { - this.curClobWriter = LobFile.create(getNextLobFilePath(), conf, true); - } - - return this.curClobWriter; - } - - /** - * Returns the path being written to by a given LobFile.Writer, relative - * to the working directory of this LargeObjectLoader. - * @param w the LobFile.Writer whose path should be examined. - * @return the path this is writing to, relative to the current working dir. - */ - private String getRelativePath(LobFile.Writer w) { - Path writerPath = w.getPath(); - - String writerPathStr = writerPath.toString(); - String workPathStr = workPath.toString(); - if (!workPathStr.endsWith(File.separator)) { - workPathStr = workPathStr + File.separator; - } - - if (writerPathStr.startsWith(workPathStr)) { - return writerPathStr.substring(workPathStr.length()); - } - - // Outside the working dir; return the whole thing. - return writerPathStr; - } - - /** - * Copies all character data from the provided Reader to the provided - * Writer. Does not close handles when it's done. - * @param reader data source - * @param writer data sink - * @throws IOException if an I/O error occurs either reading or writing. - */ - private void copyAll(Reader reader, Writer writer) throws IOException { - int bufferSize = conf.getInt("io.file.buffer.size", - 4096); - char [] buf = new char[bufferSize]; - - while (true) { - int charsRead = reader.read(buf); - if (-1 == charsRead) { - break; // no more stream to read. - } - writer.write(buf, 0, charsRead); - } - } - - /** - * Copies all byte data from the provided InputStream to the provided - * OutputStream. Does not close handles when it's done. - * @param input data source - * @param output data sink - * @throws IOException if an I/O error occurs either reading or writing. 
- */ - private void copyAll(InputStream input, OutputStream output) - throws IOException { - int bufferSize = conf.getInt("io.file.buffer.size", - 4096); - byte [] buf = new byte[bufferSize]; - - while (true) { - int bytesRead = input.read(buf, 0, bufferSize); - if (-1 == bytesRead) { - break; // no more stream to read. - } - output.write(buf, 0, bytesRead); - } - } - - /** - * Actually read a BlobRef instance from the ResultSet and materialize - * the data either inline or to a file. - * - * @param colNum the column of the ResultSet's current row to read. - * @param r the ResultSet to read from. - * @return a BlobRef encapsulating the data in this field. - * @throws IOException if an error occurs writing to the FileSystem. - * @throws SQLException if an error occurs reading from the database. - */ - public BlobRef readBlobRef(int colNum, ResultSet r) - throws IOException, InterruptedException, SQLException { - - long maxInlineLobLen = conf.getLong( - MAX_INLINE_LOB_LEN_KEY, - DEFAULT_MAX_LOB_LENGTH); - - Blob b = r.getBlob(colNum); - if (null == b) { - return null; - } else if (b.length() > maxInlineLobLen) { - // Deserialize very large BLOBs into separate files. - long len = b.length(); - LobFile.Writer lobWriter = getBlobWriter(); - - long recordOffset = lobWriter.tell(); - InputStream is = null; - OutputStream os = lobWriter.writeBlobRecord(len); - try { - is = b.getBinaryStream(); - copyAll(is, os); - } finally { - if (null != os) { - os.close(); - } - - if (null != is) { - is.close(); - } - - // Mark the record as finished. - lobWriter.finishRecord(); - } - - return new BlobRef(getRelativePath(curBlobWriter), recordOffset, len); - } else { - // This is a 1-based array. - return new BlobRef(b.getBytes(1, (int) b.length())); - } - } - - - /** - * Actually read a ClobRef instance from the ResultSet and materialize - * the data either inline or to a file. - * - * @param colNum the column of the ResultSet's current row to read. - * @param r the ResultSet to read from. - * @return a ClobRef encapsulating the data in this field. - * @throws IOException if an error occurs writing to the FileSystem. - * @throws SQLException if an error occurs reading from the database. - */ - public ClobRef readClobRef(int colNum, ResultSet r) - throws IOException, InterruptedException, SQLException { - - long maxInlineLobLen = conf.getLong( - MAX_INLINE_LOB_LEN_KEY, - DEFAULT_MAX_LOB_LENGTH); - - Clob c = r.getClob(colNum); - if (null == c) { - return null; - } else if (c.length() > maxInlineLobLen) { - // Deserialize large CLOB into separate file. - long len = c.length(); - LobFile.Writer lobWriter = getClobWriter(); - - long recordOffset = lobWriter.tell(); - Reader reader = null; - Writer w = lobWriter.writeClobRecord(len); - try { - reader = c.getCharacterStream(); - copyAll(reader, w); - } finally { - if (null != w) { - w.close(); - } - - if (null != reader) { - reader.close(); - } - - // Mark the record as finished. - lobWriter.finishRecord(); - } - - return new ClobRef(getRelativePath(lobWriter), recordOffset, len); - } else { - // This is a 1-based array. 
- return new ClobRef(c.getSubString(1, (int) c.length())); - } + super(conf, workPath); } } diff --git a/src/java/com/cloudera/sqoop/lib/LobRef.java b/src/java/com/cloudera/sqoop/lib/LobRef.java index 5509555e..518b6226 100644 --- a/src/java/com/cloudera/sqoop/lib/LobRef.java +++ b/src/java/com/cloudera/sqoop/lib/LobRef.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,28 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.lib; -import java.io.Closeable; -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapreduce.InputSplit; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.lib.input.FileSplit; -import com.cloudera.sqoop.io.LobFile; -import com.cloudera.sqoop.io.LobReaderCache; import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; /** * Abstract base class that holds a reference to a Blob or a Clob. @@ -46,284 +27,28 @@ * CONTAINERTYPE is the type used to hold this data (e.g., BytesWritable). * ACCESSORTYPE is the type used to access this data in a streaming fashion * (either an InputStream or a Reader). + * + * @deprecated use org.apache.sqoop.lib.LobRef instead. + * @see org.apache.sqoop.lib.LobRef */ public abstract class LobRef - implements Closeable, Writable { + extends org.apache.sqoop.lib.LobRef { - public static final Log LOG = LogFactory.getLog(LobRef.class.getName()); + public static final Log LOG = org.apache.sqoop.lib.LobRef.LOG; protected LobRef() { - this.fileName = null; - this.offset = 0; - this.length = 0; - - this.realData = null; + super(); } protected LobRef(CONTAINERTYPE container) { - this.fileName = null; - this.offset = 0; - this.length = 0; - - this.realData = container; + super(container); } protected LobRef(String file, long offset, long length) { - this.fileName = file; - this.offset = offset; - this.length = length; - - this.realData = null; + super(file, offset, length); } - // If the data is 'small', it's held directly, here. - private CONTAINERTYPE realData; - - /** Internal API to retrieve the data object. */ - protected CONTAINERTYPE getDataObj() { - return realData; - } - - /** Internal API to set the data object. */ - protected void setDataObj(CONTAINERTYPE data) { - this.realData = data; - } - - // If there data is too large to materialize fully, it's written into a file - // whose path (relative to the rest of the dataset) is recorded here. This - // takes precedence if the value fof fileName is non-null. These records are - // currently written into LobFile-formatted files, which hold multiple - // records. The starting offset and length of the record are recorded here - // as well. - private String fileName; - private long offset; - private long length; - - // If we've opened a LobFile object, track our reference to it here. - private LobFile.Reader lobReader; - - @Override - @SuppressWarnings("unchecked") - /** - * Clone the current reference object. 
data is deep-copied; any open - * file handle remains with the original only. - */ - public Object clone() throws CloneNotSupportedException { - LobRef r = - (LobRef) super.clone(); - - r.lobReader = null; // Reference to opened reader is not duplicated. - if (null != realData) { - r.realData = deepCopyData(realData); - } - - return r; - } - - @Override - protected synchronized void finalize() throws Throwable { - close(); - super.finalize(); - } - - public void close() throws IOException { - // Discard any open LobReader. - if (null != this.lobReader) { - LobReaderCache.getCache().recycle(this.lobReader); - } - } - - /** - * @return true if the LOB data is in an external file; false if - * it materialized inline. - */ - public boolean isExternal() { - return fileName != null; - } - - /** - * Convenience method to access #getDataStream(Configuration, Path) - * from within a map task that read this LobRef from a file-based - * InputSplit. - * @param mapContext the Mapper.Context instance that encapsulates - * the current map task. - * @return an object that lazily streams the record to the client. - * @throws IllegalArgumentException if it cannot find the source - * path for this LOB based on the MapContext. - * @throws IOException if it could not read the LOB from external storage. - */ - public ACCESSORTYPE getDataStream(Mapper.Context mapContext) - throws IOException { - InputSplit split = mapContext.getInputSplit(); - if (split instanceof FileSplit) { - Path basePath = ((FileSplit) split).getPath().getParent(); - return getDataStream(mapContext.getConfiguration(), - basePath); - } else { - throw new IllegalArgumentException( - "Could not ascertain LOB base path from MapContext."); - } - } - - /** - * Get access to the LOB data itself. - * This method returns a lazy reader of the LOB data, accessing the - * filesystem for external LOB storage as necessary. - * @param conf the Configuration used to access the filesystem - * @param basePath the base directory where the table records are - * stored. - * @return an object that lazily streams the record to the client. - * @throws IOException if it could not read the LOB from external storage. - */ - public ACCESSORTYPE getDataStream(Configuration conf, Path basePath) - throws IOException { - if (isExternal()) { - // Read from external storage. - Path pathToRead = LobReaderCache.qualify( - new Path(basePath, fileName), conf); - LOG.debug("Retreving data stream from external path: " + pathToRead); - if (lobReader != null) { - // We already have a reader open to a LobFile. Is it the correct file? - if (!pathToRead.equals(lobReader.getPath())) { - // No. Close this.lobReader and get the correct one. - LOG.debug("Releasing previous external reader for " - + lobReader.getPath()); - LobReaderCache.getCache().recycle(lobReader); - lobReader = LobReaderCache.getCache().get(pathToRead, conf); - } - } else { - lobReader = LobReaderCache.getCache().get(pathToRead, conf); - } - - // We now have a LobFile.Reader associated with the correct file. Get to - // the correct offset and return an InputStream/Reader to the user. - if (lobReader.tell() != offset) { - LOG.debug("Seeking to record start offset " + offset); - lobReader.seek(offset); - } - - if (!lobReader.next()) { - throw new IOException("Could not locate record at " + pathToRead - + ":" + offset); - } - - return getExternalSource(lobReader); - } else { - // This data is already materialized in memory; wrap it and return. 
- return getInternalSource(realData); - } - } - - /** - * Using the LobFile reader, get an accessor InputStream or Reader to the - * underlying data. - */ - protected abstract ACCESSORTYPE getExternalSource(LobFile.Reader reader) - throws IOException; - - /** - * Wrap the materialized data in an InputStream or Reader. - */ - protected abstract ACCESSORTYPE getInternalSource(CONTAINERTYPE data); - - /** - * @return the materialized data itself. - */ - protected abstract DATATYPE getInternalData(CONTAINERTYPE data); - - /** - * Make a copy of the materialized data. - */ - protected abstract CONTAINERTYPE deepCopyData(CONTAINERTYPE data); - - public DATATYPE getData() { - if (isExternal()) { - throw new RuntimeException( - "External LOBs must be read via getDataStream()"); - } - - return getInternalData(realData); - } - - @Override - public String toString() { - if (isExternal()) { - return "externalLob(lf," + fileName + "," + Long.toString(offset) - + "," + Long.toString(length) + ")"; - } else { - return realData.toString(); - } - } - - @Override - public void readFields(DataInput in) throws IOException { - // The serialization format for this object is: - // boolean isExternal - // if true, then: - // a string identifying the external storage type - // and external-storage-specific data. - // if false, then we use readFieldsInternal() to allow BlobRef/ClobRef - // to serialize as it sees fit. - // - // Currently the only external storage supported is LobFile, identified - // by the string "lf". This serializes with the filename (as a string), - // followed by a long-valued offset and a long-valued length. - - boolean isExternal = in.readBoolean(); - if (isExternal) { - this.realData = null; - - String storageType = Text.readString(in); - if (!storageType.equals("lf")) { - throw new IOException("Unsupported external LOB storage code: " - + storageType); - } - - // Storage type "lf" is LobFile: filename, offset, length. - this.fileName = Text.readString(in); - this.offset = in.readLong(); - this.length = in.readLong(); - } else { - readFieldsInternal(in); - - this.fileName = null; - this.offset = 0; - this.length = 0; - } - } - - /** - * Perform the readFields() operation on a fully-materializable record. - * @param in the DataInput to deserialize from. - */ - protected abstract void readFieldsInternal(DataInput in) throws IOException; - - @Override - public void write(DataOutput out) throws IOException { - out.writeBoolean(isExternal()); - if (isExternal()) { - Text.writeString(out, "lf"); // storage type "lf" for LobFile. - Text.writeString(out, fileName); - out.writeLong(offset); - out.writeLong(length); - } else { - writeInternal(out); - } - } - - /** - * Perform the write() operation on a fully-materializable record. - * @param out the DataOutput to deserialize to. 
- */ - protected abstract void writeInternal(DataOutput out) throws IOException; - protected static final ThreadLocal EXTERNAL_MATCHER = - new ThreadLocal() { - @Override protected Matcher initialValue() { - Pattern externalPattern = Pattern.compile( - "externalLob\\(lf,(.*),([0-9]+),([0-9]+)\\)"); - return externalPattern.matcher(""); - } - }; + org.apache.sqoop.lib.LobRef.EXTERNAL_MATCHER; } diff --git a/src/java/com/cloudera/sqoop/lib/LobSerializer.java b/src/java/com/cloudera/sqoop/lib/LobSerializer.java index 906db7f2..b8324fe2 100644 --- a/src/java/com/cloudera/sqoop/lib/LobSerializer.java +++ b/src/java/com/cloudera/sqoop/lib/LobSerializer.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,7 +15,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.lib; import java.io.DataInput; @@ -26,6 +23,9 @@ /** * Serialize LOB classes to/from DataInput and DataOutput objects. + * + * @deprecated use org.apache.sqoop.lib.LobSerializer instead. + * @see org.apache.sqoop.lib.LobSerializer */ public final class LobSerializer { @@ -33,23 +33,19 @@ private LobSerializer() { } public static void writeClob(ClobRef clob, DataOutput out) throws IOException { - clob.write(out); + org.apache.sqoop.lib.LobSerializer.writeClob(clob, out); } public static void writeBlob(BlobRef blob, DataOutput out) throws IOException { - blob.write(out); + org.apache.sqoop.lib.LobSerializer.writeBlob(blob, out); } public static ClobRef readClobFields(DataInput in) throws IOException { - ClobRef clob = new ClobRef(); - clob.readFields(in); - return clob; + return org.apache.sqoop.lib.LobSerializer.readClobFields(in); } public static BlobRef readBlobFields(DataInput in) throws IOException { - BlobRef blob = new BlobRef(); - blob.readFields(in); - return blob; + return org.apache.sqoop.lib.LobSerializer.readBlobFields(in); } } diff --git a/src/java/com/cloudera/sqoop/lib/ProcessingException.java b/src/java/com/cloudera/sqoop/lib/ProcessingException.java index 72ee7ba3..c4216b19 100644 --- a/src/java/com/cloudera/sqoop/lib/ProcessingException.java +++ b/src/java/com/cloudera/sqoop/lib/ProcessingException.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,14 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.lib; /** * General error during processing of a SqoopRecord. + * + * @deprecated use org.apache.sqoop.lib.ProcessingException instead. + * @see org.apache.sqoop.lib.ProcessingException */ @SuppressWarnings("serial") -public class ProcessingException extends Exception { +public class ProcessingException + extends org.apache.sqoop.lib.ProcessingException { public ProcessingException() { super("ProcessingException"); @@ -41,10 +42,4 @@ public ProcessingException(final Throwable cause) { public ProcessingException(final String message, final Throwable cause) { super(message, cause); } - - @Override - public String toString() { - String msg = getMessage(); - return (null == msg) ? 
"ProcessingException" : msg; - } } diff --git a/src/java/com/cloudera/sqoop/lib/RecordParser.java b/src/java/com/cloudera/sqoop/lib/RecordParser.java index f906589e..a3238e82 100644 --- a/src/java/com/cloudera/sqoop/lib/RecordParser.java +++ b/src/java/com/cloudera/sqoop/lib/RecordParser.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,18 +15,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.lib; import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import org.apache.hadoop.io.Text; - -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.util.ArrayList; -import java.util.List; /** * Parses a record containing one or more fields. Fields are separated @@ -53,26 +42,25 @@ * The fields parsed by RecordParser are backed by an internal buffer * which is cleared when the next call to parseRecord() is made. If * the buffer is required to be preserved, you must copy it yourself. + * + * @deprecated use org.apache.sqoop.lib.RecordParser instead. + * @see org.apache.sqoop.lib.RecordParser */ -public final class RecordParser { +public final class RecordParser extends org.apache.sqoop.lib.RecordParser { - public static final Log LOG = LogFactory.getLog(RecordParser.class.getName()); - - private enum ParseState { - FIELD_START, - ENCLOSED_FIELD, - UNENCLOSED_FIELD, - ENCLOSED_ESCAPE, - ENCLOSED_EXPECT_DELIMITER, - UNENCLOSED_ESCAPE - } + public static final Log LOG = org.apache.sqoop.lib.RecordParser.LOG; /** * An error thrown when parsing fails. + * + * @deprecated use org.apache.sqoop.lib.RecordParser.ParseError instead. + * @see org.apache.sqoop.lib.RecordParser.ParseError */ - public static class ParseError extends Exception { + public static class ParseError + extends org.apache.sqoop.lib.RecordParser.ParseError { + public ParseError() { - super("ParseError"); + super(); } public ParseError(final String msg) { @@ -88,273 +76,7 @@ public ParseError(final Throwable cause) { } } - private DelimiterSet delimiters; - private ArrayList outputs; - public RecordParser(final DelimiterSet delimitersIn) { - this.delimiters = delimitersIn.copy(); - this.outputs = new ArrayList(); - } - - /** - * Return a list of strings representing the fields of the input line. - * This list is backed by an internal buffer which is cleared by the - * next call to parseRecord(). - */ - public List parseRecord(CharSequence input) throws ParseError { - if (null == input) { - throw new ParseError("null input string"); - } - - return parseRecord(CharBuffer.wrap(input)); - } - - /** - * Return a list of strings representing the fields of the input line. - * This list is backed by an internal buffer which is cleared by the - * next call to parseRecord(). - */ - public List parseRecord(Text input) throws ParseError { - if (null == input) { - throw new ParseError("null input string"); - } - - // TODO(aaron): The parser should be able to handle UTF-8 strings - // as well, to avoid this transcode operation. - return parseRecord(input.toString()); - } - - /** - * Return a list of strings representing the fields of the input line. - * This list is backed by an internal buffer which is cleared by the - * next call to parseRecord(). 
- */ - public List parseRecord(byte [] input) throws ParseError { - if (null == input) { - throw new ParseError("null input string"); - } - - return parseRecord(ByteBuffer.wrap(input).asCharBuffer()); - } - - /** - * Return a list of strings representing the fields of the input line. - * This list is backed by an internal buffer which is cleared by the - * next call to parseRecord(). - */ - public List parseRecord(char [] input) throws ParseError { - if (null == input) { - throw new ParseError("null input string"); - } - - return parseRecord(CharBuffer.wrap(input)); - } - - public List parseRecord(ByteBuffer input) throws ParseError { - if (null == input) { - throw new ParseError("null input string"); - } - - return parseRecord(input.asCharBuffer()); - } - - // TODO(aaron): Refactor this method to be much shorter. - // CHECKSTYLE:OFF - /** - * Return a list of strings representing the fields of the input line. - * This list is backed by an internal buffer which is cleared by the - * next call to parseRecord(). - */ - public List parseRecord(CharBuffer input) throws ParseError { - if (null == input) { - throw new ParseError("null input string"); - } - - /* - This method implements the following state machine to perform - parsing. - - Note that there are no restrictions on whether particular characters - (e.g., field-sep, record-sep, etc) are distinct or the same. The - state transitions are processed in the order seen in this comment. - - Starting state is FIELD_START - encloser -> ENCLOSED_FIELD - escape char -> UNENCLOSED_ESCAPE - field delim -> FIELD_START (for a new field) - record delim -> stops processing - all other letters get added to current field, -> UNENCLOSED FIELD - - ENCLOSED_FIELD state: - escape char goes to ENCLOSED_ESCAPE - encloser goes to ENCLOSED_EXPECT_DELIMITER - field sep or record sep gets added to the current string - normal letters get added to the current string - - ENCLOSED_ESCAPE state: - any character seen here is added literally, back to ENCLOSED_FIELD - - ENCLOSED_EXPECT_DELIMITER state: - field sep goes to FIELD_START - record sep halts processing. - all other characters are errors. - - UNENCLOSED_FIELD state: - ESCAPE char goes to UNENCLOSED_ESCAPE - FIELD_SEP char goes to FIELD_START - RECORD_SEP char halts processing - normal chars or the enclosing char get added to the current string - - UNENCLOSED_ESCAPE: - add charater literal to current string, return to UNENCLOSED_FIELD - */ - - char curChar = DelimiterSet.NULL_CHAR; - ParseState state = ParseState.FIELD_START; - int len = input.length(); - StringBuilder sb = null; - - outputs.clear(); - - char enclosingChar = delimiters.getEnclosedBy(); - char fieldDelim = delimiters.getFieldsTerminatedBy(); - char recordDelim = delimiters.getLinesTerminatedBy(); - char escapeChar = delimiters.getEscapedBy(); - boolean enclosingRequired = delimiters.isEncloseRequired(); - - for (int pos = 0; pos < len; pos++) { - curChar = input.get(); - switch (state) { - case FIELD_START: - // ready to start processing a new field. - if (null != sb) { - // We finished processing a previous field. Add to the list. - outputs.add(sb.toString()); - } - - sb = new StringBuilder(); - if (enclosingChar == curChar) { - // got an opening encloser. - state = ParseState.ENCLOSED_FIELD; - } else if (escapeChar == curChar) { - state = ParseState.UNENCLOSED_ESCAPE; - } else if (fieldDelim == curChar) { - // we have a zero-length field. This is a no-op. 
- continue; - } else if (recordDelim == curChar) { - // we have a zero-length field, that ends processing. - pos = len; - } else { - // current char is part of the field. - state = ParseState.UNENCLOSED_FIELD; - sb.append(curChar); - - if (enclosingRequired) { - throw new ParseError( - "Opening field-encloser expected at position " + pos); - } - } - - break; - - case ENCLOSED_FIELD: - if (escapeChar == curChar) { - // the next character is escaped. Treat it literally. - state = ParseState.ENCLOSED_ESCAPE; - } else if (enclosingChar == curChar) { - // we're at the end of the enclosing field. Expect an EOF or EOR char. - state = ParseState.ENCLOSED_EXPECT_DELIMITER; - } else { - // this is a regular char, or an EOF / EOR inside an encloser. Add to - // the current field string, and remain in this state. - sb.append(curChar); - } - - break; - - case UNENCLOSED_FIELD: - if (escapeChar == curChar) { - // the next character is escaped. Treat it literally. - state = ParseState.UNENCLOSED_ESCAPE; - } else if (fieldDelim == curChar) { - // we're at the end of this field; may be the start of another one. - state = ParseState.FIELD_START; - } else if (recordDelim == curChar) { - pos = len; // terminate processing immediately. - } else { - // this is a regular char. Add to the current field string, - // and remain in this state. - sb.append(curChar); - } - - break; - - case ENCLOSED_ESCAPE: - // Treat this character literally, whatever it is, and return to - // enclosed field processing. - sb.append(curChar); - state = ParseState.ENCLOSED_FIELD; - break; - - case ENCLOSED_EXPECT_DELIMITER: - // We were in an enclosed field, but got the final encloser. Now we - // expect either an end-of-field or an end-of-record. - if (fieldDelim == curChar) { - // end of one field is the beginning of the next. - state = ParseState.FIELD_START; - } else if (recordDelim == curChar) { - // stop processing. - pos = len; - } else { - // Don't know what to do with this character. - throw new ParseError("Expected delimiter at position " + pos); - } - - break; - - case UNENCLOSED_ESCAPE: - // Treat this character literally, whatever it is, and return to - // non-enclosed field processing. - sb.append(curChar); - state = ParseState.UNENCLOSED_FIELD; - break; - - default: - throw new ParseError("Unexpected parser state: " + state); - } - } - - if (state == ParseState.FIELD_START && curChar == fieldDelim) { - // we hit an EOF/EOR as the last legal character and we need to mark - // that string as recorded. This if block is outside the for-loop since - // we don't have a physical 'epsilon' token in our string. - if (null != sb) { - outputs.add(sb.toString()); - sb = new StringBuilder(); - } - } - - if (null != sb) { - // There was a field that terminated by running out of chars or an EOR - // character. Add to the list. 
- outputs.add(sb.toString()); - } - - return outputs; - } - // CHECKSTYLE:ON - - public boolean isEnclosingRequired() { - return delimiters.isEncloseRequired(); - } - - @Override - public String toString() { - return "RecordParser[" + delimiters.toString() + "]"; - } - - @Override - public int hashCode() { - return this.delimiters.hashCode(); + super(delimitersIn); } } diff --git a/src/java/com/cloudera/sqoop/lib/SqoopRecord.java b/src/java/com/cloudera/sqoop/lib/SqoopRecord.java index eacebeb1..7cfcbb36 100644 --- a/src/java/com/cloudera/sqoop/lib/SqoopRecord.java +++ b/src/java/com/cloudera/sqoop/lib/SqoopRecord.java @@ -1,6 +1,4 @@ /** - * Copyright 2011 The Apache Software Foundation - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -17,136 +15,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package com.cloudera.sqoop.lib; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.sql.PreparedStatement; -import java.sql.SQLException; -import java.util.Map; - -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapreduce.lib.db.DBWritable; /** * Interface implemented by the classes generated by sqoop's orm.ClassWriter. + * + * @deprecated use org.apache.sqoop.lib.SqoopRecord instead. + * @see org.apache.sqoop.lib.SqoopRecord */ -public abstract class SqoopRecord implements Cloneable, DBWritable, - FieldMappable, Writable { +public abstract class SqoopRecord extends org.apache.sqoop.lib.SqoopRecord { public SqoopRecord() { } - - public abstract void parse(CharSequence s) throws RecordParser.ParseError; - public abstract void parse(Text s) throws RecordParser.ParseError; - public abstract void parse(byte [] s) throws RecordParser.ParseError; - public abstract void parse(char [] s) throws RecordParser.ParseError; - public abstract void parse(ByteBuffer s) throws RecordParser.ParseError; - public abstract void parse(CharBuffer s) throws RecordParser.ParseError; - public abstract void loadLargeObjects(LargeObjectLoader objLoader) - throws SQLException, IOException, InterruptedException; - - /** - * Inserts the data in this object into the PreparedStatement, starting - * at parameter 'offset'. - * @return the number of fields written to the statement. - */ - public abstract int write(PreparedStatement stmt, int offset) - throws SQLException; - - /** - * Format output data according to the specified delimiters. - */ - public abstract String toString(DelimiterSet delimiters); - - /** - * Use the default delimiters, but only append an end-of-record delimiter - * if useRecordDelim is true. - */ - public String toString(boolean useRecordDelim) { - // Method body should be overridden by generated classes in 1.3.0+ - if (useRecordDelim) { - // This is the existing functionality. - return toString(); - } else { - // Setting this to false requires behavior in the generated class. - throw new RuntimeException( - "toString(useRecordDelim=false) requires a newer SqoopRecord. " - + "Please regenerate your record class to use this function."); - } - } - - /** - * Format the record according to the specified delimiters. An end-of-record - * delimiter is optional, and only used if useRecordDelim is true. For - * use with TextOutputFormat, calling this with useRecordDelim=false may - * make more sense. 
- */ - public String toString(DelimiterSet delimiters, boolean useRecordDelim) { - if (useRecordDelim) { - return toString(delimiters); - } else { - // Setting this to false requires behavior in the generated class. - throw new RuntimeException( - "toString(delimiters, useRecordDelim=false) requires a newer " - + "SqoopRecord. Please regenerate your record class to use this " - + "function."); - } - } - - @Override - public Object clone() throws CloneNotSupportedException { - return super.clone(); - } - - /** - * Returns an integer specifying which API format version the - * generated class conforms to. Used by internal APIs for backwards - * compatibility. - * @return the API version this class was generated against. - */ - public abstract int getClassFormatVersion(); - - /** - * Use the delegate pattern to allow arbitrary processing of the - * fields of this record. - * @param processor A delegate that operates on this object. - * @throws IOException if the processor encounters an IO error when - * operating on this object. - * @throws ProcessingException if the FieldMapProcessor encounters - * a general processing error when operating on this object. - */ - public void delegate(FieldMapProcessor processor) - throws IOException, ProcessingException { - processor.accept(this); - } - - @Override - /** - * {@inheriDoc} - * @throws RuntimeException if used with a record that was generated - * before this capability was added (1.1.0). - */ - public Map getFieldMap() { - // Default implementation does not support field iteration. - // ClassWriter should provide an overriding version. - throw new RuntimeException( - "Got null field map from record. Regenerate your record class."); - } - - /** - * Allows an arbitrary field to be set programmatically to the - * specified value object. The value object must match the - * type expected for the particular field or a RuntimeException - * will result. - * @throws RuntimeException if the specified field name does not exist. - */ - public void setField(String fieldName, Object fieldVal) { - throw new RuntimeException("This SqoopRecord does not support setField(). " - + "Regenerate your record class."); - } } diff --git a/src/java/org/apache/sqoop/io/CodecMap.java b/src/java/org/apache/sqoop/io/CodecMap.java new file mode 100644 index 00000000..5b672061 --- /dev/null +++ b/src/java/org/apache/sqoop/io/CodecMap.java @@ -0,0 +1,170 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sqoop.io; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.CompressionCodecFactory; +import org.apache.hadoop.util.ReflectionUtils; + +/** + * Provides a mapping from codec names to concrete implementation class names. + */ +public final class CodecMap { + + // Supported codec map values + // Note: do not add more values here, since codecs are discovered using the + // standard Hadoop mechanism (io.compression.codecs). See + // CompressionCodecFactory. + public static final String NONE = "none"; + public static final String DEFLATE = "deflate"; + public static final String LZO = "lzo"; + public static final String LZOP = "lzop"; + + private static Map codecNames; + static { + codecNames = new TreeMap(); + + // Register the names of codecs we know about. + codecNames.put(NONE, null); + codecNames.put(DEFLATE, "org.apache.hadoop.io.compress.DefaultCodec"); + codecNames.put(LZO, "com.hadoop.compression.lzo.LzoCodec"); + codecNames.put(LZOP, "com.hadoop.compression.lzo.LzopCodec"); + + // add more from Hadoop CompressionCodecFactory + for (Class cls + : CompressionCodecFactory.getCodecClasses(new Configuration())) { + String simpleName = cls.getSimpleName(); + String codecName = simpleName; + if (simpleName.endsWith("Codec")) { + codecName = simpleName.substring(0, simpleName.length() + - "Codec".length()); + } + codecNames.put(codecName.toLowerCase(), cls.getCanonicalName()); + } + } + + private CodecMap() { + } + + /** + * Given a codec name, return the name of the concrete class + * that implements it (or 'null' in the case of the "none" codec). + * @throws com.cloudera.sqoop.io.UnsupportedCodecException if a codec cannot + * be found with the supplied name. + */ + public static String getCodecClassName(String codecName) + throws com.cloudera.sqoop.io.UnsupportedCodecException { + if (!codecNames.containsKey(codecName)) { + throw new com.cloudera.sqoop.io.UnsupportedCodecException(codecName); + } + + return codecNames.get(codecName); + } + + /** + * Given a codec name, instantiate the concrete implementation + * class that implements it. + * @throws com.cloudera.sqoop.io.UnsupportedCodecException if a codec cannot + * be found with the supplied name. + */ + public static CompressionCodec getCodec(String codecName, + Configuration conf) throws com.cloudera.sqoop.io.UnsupportedCodecException { + // Try standard Hadoop mechanism first + CompressionCodec codec = getCodecByName(codecName, conf); + if (codec != null) { + return codec; + } + // Fall back to Sqoop mechanism + String codecClassName = null; + try { + codecClassName = getCodecClassName(codecName); + if (null == codecClassName) { + return null; + } + Class codecClass = + (Class) + conf.getClassByName(codecClassName); + return (CompressionCodec) ReflectionUtils.newInstance( + codecClass, conf); + } catch (ClassNotFoundException cnfe) { + throw new com.cloudera.sqoop.io.UnsupportedCodecException( + "Cannot find codec class " + + codecClassName + " for codec " + codecName); + } + } + + /** + * Return the set of available codec names. + */ + public static Set getCodecNames() { + return codecNames.keySet(); + } + + /** + * Find the relevant compression codec for the codec's canonical class name + * or by codec alias. + *

+ * Codec aliases are case insensitive. + *

+ * The codec alias is the short class name (without the package name). + * If the short class name ends with 'Codec', then there are two aliases for + * the codec, the complete short class name and the short class name without + * the 'Codec' ending. For example, for the 'GzipCodec' codec class name the + * aliases are 'gzip' and 'gzipcodec'. + *

+ * Note: When HADOOP-7323 is available this method can be replaced with a call + * to CompressionCodecFactory. + * @param classname the canonical class name of the codec or the codec alias + * @return the codec object or null if none matching the name were found + */ + private static CompressionCodec getCodecByName(String codecName, + Configuration conf) { + List> codecs = + CompressionCodecFactory.getCodecClasses(conf); + for (Class cls : codecs) { + if (codecMatches(cls, codecName)) { + return ReflectionUtils.newInstance(cls, conf); + } + } + return null; + } + + + private static boolean codecMatches(Class cls, + String codecName) { + String simpleName = cls.getSimpleName(); + if (cls.getName().equals(codecName) + || simpleName.equalsIgnoreCase(codecName)) { + return true; + } + if (simpleName.endsWith("Codec")) { + String prefix = simpleName.substring(0, simpleName.length() + - "Codec".length()); + if (prefix.equalsIgnoreCase(codecName)) { + return true; + } + } + return false; + } +} diff --git a/src/java/org/apache/sqoop/io/FixedLengthInputStream.java b/src/java/org/apache/sqoop/io/FixedLengthInputStream.java new file mode 100644 index 00000000..1cd10cba --- /dev/null +++ b/src/java/org/apache/sqoop/io/FixedLengthInputStream.java @@ -0,0 +1,87 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.io; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.commons.io.input.CountingInputStream; +import org.apache.commons.io.input.ProxyInputStream; + +/** + * Provides an InputStream that can consume a fixed maximum number of bytes + * from an underlying stream. Closing the FixedLengthInputStream does not + * close the underlying stream. After reading the maximum number of available + * bytes this acts as though EOF has been reached. + */ +public class FixedLengthInputStream extends ProxyInputStream { + + private CountingInputStream countingIn; + private long maxBytes; + + public FixedLengthInputStream(InputStream stream, long maxLen) { + super(new CountingInputStream(new CloseShieldInputStream(stream))); + + // Save a correctly-typed reference to the underlying stream. + this.countingIn = (CountingInputStream) this.in; + this.maxBytes = maxLen; + } + + /** @return the number of bytes already consumed by the client. */ + private long consumed() { + return countingIn.getByteCount(); + } + + /** + * @return number of bytes remaining to be read before the limit + * is reached. 
+ */ + private long toLimit() { + return maxBytes - consumed(); + } + + @Override + public int available() throws IOException { + return (int) Math.min(toLimit(), countingIn.available()); + } + + @Override + public int read() throws IOException { + if (toLimit() > 0) { + return super.read(); + } else { + return -1; // EOF. + } + } + + @Override + public int read(byte [] buf) throws IOException { + return read(buf, 0, buf.length); + } + + @Override + public int read(byte [] buf, int start, int count) throws IOException { + long limit = toLimit(); + if (limit == 0) { + return -1; // EOF. + } else { + return super.read(buf, start, (int) Math.min(count, limit)); + } + } +} diff --git a/src/java/org/apache/sqoop/io/LobFile.java b/src/java/org/apache/sqoop/io/LobFile.java new file mode 100644 index 00000000..4e1bf2cb --- /dev/null +++ b/src/java/org/apache/sqoop/io/LobFile.java @@ -0,0 +1,1821 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.io; + +import java.io.BufferedOutputStream; +import java.io.Closeable; +import java.io.DataInput; +import java.io.DataInputStream; +import java.io.DataOutput; +import java.io.DataOutputStream; +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; + +import org.apache.commons.io.output.CloseShieldOutputStream; +import org.apache.commons.io.output.CountingOutputStream; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.DataInputBuffer; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableUtils; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.Compressor; +import org.apache.hadoop.io.compress.CompressorStream; +import org.apache.hadoop.io.compress.Decompressor; +import org.apache.hadoop.io.compress.DecompressorStream; + +import com.cloudera.sqoop.io.LobReaderCache; +import com.cloudera.sqoop.util.RandomHash; + +/** + * File format which stores large object 
records. + * The format allows large objects to be read through individual InputStreams + * to allow reading without full materialization of a single record. + * Each record is assigned an id and can be accessed by id efficiently by + * consulting an index at the end of the file. + * + * The LobFile format is specified at: + * http://wiki.github.com/cloudera/sqoop/sip-3 + */ +public final class LobFile { + + public static final Log LOG = LogFactory.getLog(LobFile.class.getName()); + public static final int LATEST_LOB_VERSION = 0; + + public static final char[] HEADER_ID_STR = { 'L', 'O', 'B' }; + + //Value for entryId to write to the beginning of an IndexSegment. + public static final long SEGMENT_HEADER_ID = -1; + + //Value for entryId to write before the finale. + public static final long SEGMENT_OFFSET_ID = -2; + + //Value for entryID to write before the IndexTable + public static final long INDEX_TABLE_ID = -3; + + private LobFile() { + } + + /** + * Creates a LobFile Reader configured to read from the specified file. + */ + public static com.cloudera.sqoop.io.LobFile.Reader + open(Path p, Configuration conf) throws IOException { + FileSystem fs = p.getFileSystem(conf); + FileStatus [] stats = fs.listStatus(p); + if (null == stats || stats.length == 0) { + throw new IOException("Could not find file: " + p); + } + FSDataInputStream fis = fs.open(p); + DataInputStream dis = new DataInputStream(fis); + LobFileHeader header = new LobFileHeader(dis); + int version = header.getVersion(); + + if (version == 0) { + return new V0Reader(p, conf, header, dis, fis, stats[0].getLen()); + } else { + throw new IOException("No reader available for LobFile version " + + version); + } + } + + /** + * Creates a LobFile Writer. + * @param p the path to create. + * @param conf the configuration to use to interact with the filesystem. + * @param isCharData true if this is for CLOBs, false for BLOBs. + * @param codec the compression codec to use (or null for none). + * @param entriesPerSegment number of entries per index segment. + */ + public static com.cloudera.sqoop.io.LobFile.Writer + create(Path p, Configuration conf, boolean isCharData, + String codec, int entriesPerSegment) + throws IOException { + return new V0Writer(p, conf, isCharData, codec, entriesPerSegment); + } + + /** + * Creates a LobFile Writer. + * @param p the path to create. + * @param conf the configuration to use to interact with the filesystem. + * @param isCharData true if this is for CLOBs, false for BLOBs. + * @param codec the compression codec to use (or null for none). + */ + public static com.cloudera.sqoop.io.LobFile.Writer + create(Path p, Configuration conf, boolean isCharData, + String codec) throws IOException { + return create(p, conf, isCharData, codec, + V0Writer.DEFAULT_MAX_SEGMENT_ENTRIES); + } + + /** + * Creates a LobFile Writer configured for uncompressed data. + * @param p the path to create. + * @param conf the configuration to use to interact with the filesystem. + * @param isCharData true if this is for CLOBs, false for BLOBs. + */ + public static com.cloudera.sqoop.io.LobFile.Writer + create(Path p, Configuration conf, boolean isCharData) + throws IOException { + return create(p, conf, isCharData, null); + } + + /** + * Creates a LobFile Writer configured for uncompressed binary data. + * @param p the path to create. + * @param conf the configuration to use to interact with the filesystem. 
+ */ + public static com.cloudera.sqoop.io.LobFile.Writer + create(Path p, Configuration conf) throws IOException { + return create(p, conf, false); + } + + /** + * Class that writes out a LobFile. Instantiate via LobFile.create(). + */ + public abstract static class Writer implements Closeable { + /** + * If this Writer is writing to a physical LobFile, then this returns + * the file path it is writing to. Otherwise it returns null. + * @return the fully-qualified path being written to by this writer. + */ + public abstract Path getPath(); + + /** + * Finishes writing the LobFile and closes underlying handles. + */ + public abstract void close() throws IOException; + + @Override + protected synchronized void finalize() throws Throwable { + close(); + super.finalize(); + } + + /** + * Terminates the current record and writes any trailing zero-padding + * required by the specified record size. + * This is implicitly called between consecutive writeBlobRecord() / + * writeClobRecord() calls. + */ + public abstract void finishRecord() throws IOException; + + /** + * Declares a new BLOB record to be written to the file. + * @param len the "claimed" number of bytes that will be written to + * this record. The actual number of bytes may differ. + */ + public abstract OutputStream writeBlobRecord(long len) throws IOException; + + /** + * Declares a new CLOB record to be written to the file. + * @param len the claimed number of characters that will be written to + * this record. The actual number of characters may differ. + */ + public abstract java.io.Writer writeClobRecord(long len) + throws IOException; + + /** + * Report the current position in the output file. + * @return the number of bytes written through this Writer. + */ + public abstract long tell() throws IOException; + + /** + * Checks whether an underlying stream is present or null. + * @param out the stream to check for null-ness. + * @throws IOException if out is null. + */ + protected void checkForNull(OutputStream out) throws IOException { + if (null == out) { + throw new IOException("Writer has been closed."); + } + } + } + + /** + * Class that can read a LobFile. Create with LobFile.open(). + */ + public abstract static class Reader implements Closeable { + /** + * If this Reader is reading from a physical LobFile, then this returns + * the file path it is reading from. Otherwise it returns null. + * @return the fully-qualified path being read by this reader. + */ + public abstract Path getPath(); + + /** + * Report the current position in the file. Note that the internal + * cursor may move in an unpredictable fashion; e.g., to fetch + * additional data from the index stored at the end of the file. + * Clients may be more interested in the getRecordOffset() method + * which returns the starting offset of the current record. + * @return the current offset from the start of the file in bytes. + */ + public abstract long tell() throws IOException; + + /** + * Move the file pointer to the first available full record beginning at + * position 'pos', relative to the start of the file. After calling + * seek(), you will need to call next() to move to the record itself. + * @param pos the position to seek to or past. + */ + public abstract void seek(long pos) throws IOException; + + /** + * Advances to the next record in the file. + * @return true if another record exists, or false if the + * end of the file has been reached. 
+ */ + public abstract boolean next() throws IOException; + + /** + * @return true if we have aligned the Reader (through a call to next()) + * onto a record. + */ + public abstract boolean isRecordAvailable(); + + /** + * Reports the length of the record to the user. + * If next() has not been called, or seek() has been called without + * a subsequent call to next(), or next() returned false, the return + * value of this method is undefined. + * @return the 'claimedLen' field of the current record. For + * character-based records, this is often in characters, not bytes. + * Records may have more bytes associated with them than are reported + * by this method, but never fewer. + */ + public abstract long getRecordLen(); + + /** + * Return the entryId of the current record to the user. + * If next() has not been called, or seek() has been called without + * a subsequent call to next(), or next() returned false, the return + * value of this method is undefined. + * @return the 'entryId' field of the current record. + */ + public abstract long getRecordId(); + + /** + * Return the byte offset at which the current record starts. + * If next() has not been called, or seek() has been called without + * a subsequent call to next(), or next() returned false, the return + * value of this method is undefined. + * @return the byte offset of the beginning of the current record. + */ + public abstract long getRecordOffset(); + + /** + * @return an InputStream allowing the user to read the next binary + * record from the file. + */ + public abstract InputStream readBlobRecord() throws IOException; + + /** + * @return a java.io.Reader allowing the user to read the next character + * record from the file. + */ + public abstract java.io.Reader readClobRecord() throws IOException; + + /** + * Closes the reader. + */ + public abstract void close() throws IOException; + + /** + * Checks whether an underlying stream is present or null. + * @param in the stream to check for null-ness. + * @throws IOException if in is null. + */ + protected void checkForNull(InputStream in) throws IOException { + if (null == in) { + throw new IOException("Reader has been closed."); + } + } + + /** + * @return true if the Reader.close() method has been called. + */ + public abstract boolean isClosed(); + + @Override + protected synchronized void finalize() throws Throwable { + close(); + super.finalize(); + } + } + + /** + * Represents a header block in a LobFile. Can write a new header + * block (and generate a record start mark), or read an existing + * header block. + */ + private static class LobFileHeader implements Writable { + + private int version; + private RecordStartMark startMark; + private MetaBlock metaBlock; + + /** + * Create a new LobFileHeader. + */ + public LobFileHeader() { + this.version = LATEST_LOB_VERSION; + this.startMark = new RecordStartMark(); + this.metaBlock = new MetaBlock(); + } + + /** + * Read a LobFileHeader from an existing file. + */ + public LobFileHeader(DataInput in) throws IOException { + readFields(in); + } + + /** + * Write a LobFile header to an output sink. + */ + public void write(DataOutput out) throws IOException { + // Start with the file type identification. 
+ for (char c : HEADER_ID_STR) { + out.writeByte((int) c); + } + + // Write the format version + WritableUtils.writeVInt(out, this.version); + + startMark.write(out); + metaBlock.write(out); + } + + public void readFields(DataInput in) throws IOException { + char [] chars = new char[3]; + for (int i = 0; i < 3; i++) { + chars[i] = (char) in.readByte(); + } + + // Check that these match what we expect. Throws IOE if not. + checkHeaderChars(chars); + + this.version = WritableUtils.readVInt(in); + if (this.version != LATEST_LOB_VERSION) { + // Right now we only have one version we can handle. + throw new IOException("Unexpected LobFile version " + this.version); + } + + this.startMark = new RecordStartMark(in); + this.metaBlock = new MetaBlock(in); + } + + /** + * Checks that a header array matches the standard LobFile header. + * Additional data at the end of the headerStamp is ignored. + * @param headerStamp the header bytes received from the file. + * @throws IOException if it doesn't. + */ + private void checkHeaderChars(char [] headerStamp) throws IOException { + if (headerStamp.length != HEADER_ID_STR.length) { + throw new IOException("Invalid LobFile header stamp: expected length " + + HEADER_ID_STR.length); + } + for (int i = 0; i < HEADER_ID_STR.length; i++) { + if (headerStamp[i] != HEADER_ID_STR[i]) { + throw new IOException("Invalid LobFile header stamp"); + } + } + } + + /** + * @return the format version number for this LobFile + */ + public int getVersion() { + return version; + } + + /** + * @return the RecordStartMark for this LobFile. + */ + public RecordStartMark getStartMark() { + return startMark; + } + + /** + * @return the MetaBlock for this LobFile. + */ + public MetaBlock getMetaBlock() { + return metaBlock; + } + } + + /** + * Holds a RecordStartMark -- a 16 byte randomly-generated + * sync token. Can read a RSM from an input source, or can + * generate a new one. + */ + private static class RecordStartMark implements Writable { + + // This is a 16-byte array. + public static final int START_MARK_LENGTH = 16; + + private byte [] startBytes; + + public RecordStartMark() { + generateStartMark(); + } + + public RecordStartMark(DataInput in) throws IOException { + readFields(in); + } + + public byte [] getBytes() { + byte [] out = new byte[START_MARK_LENGTH]; + System.arraycopy(this.startBytes, 0, out, 0, START_MARK_LENGTH); + return out; + } + + public void readFields(DataInput in) throws IOException { + this.startBytes = new byte[START_MARK_LENGTH]; + in.readFully(this.startBytes); + } + + public void write(DataOutput out) throws IOException { + out.write(this.startBytes); + } + + /** + * Generate a new random RecordStartMark. + */ + private void generateStartMark() { + this.startBytes = RandomHash.generateMD5Bytes(); + } + } + + /** + * Represents the metadata block stored in the header of a LobFile. + */ + private static class MetaBlock extends AbstractMap + implements Writable { + + // Strings which typically appear in the metablock have canonical names. + public static final String ENTRY_ENCODING_KEY = "EntryEncoding"; + public static final String COMPRESSION_CODEC_KEY = "CompressionCodec"; + public static final String ENTRIES_PER_SEGMENT_KEY = "EntriesPerSegment"; + + // Standard entry encodings. 
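+ // These are the values stored under ENTRY_ENCODING_KEY to mark whether the file holds CLOB (character) or BLOB (binary) records.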
+ public static final String CLOB_ENCODING = "CLOB"; + public static final String BLOB_ENCODING = "BLOB"; + + private Map<String, BytesWritable> entries; + + public MetaBlock() { + entries = new TreeMap<String, BytesWritable>(); + } + + public MetaBlock(DataInput in) throws IOException { + entries = new TreeMap<String, BytesWritable>(); + readFields(in); + } + + public MetaBlock(Map<String, BytesWritable> map) { + entries = new TreeMap<String, BytesWritable>(); + for (Map.Entry<String, BytesWritable> entry : map.entrySet()) { + entries.put(entry.getKey(), entry.getValue()); + } + } + + @Override + public Set<Map.Entry<String, BytesWritable>> entrySet() { + return entries.entrySet(); + } + + @Override + public BytesWritable put(String k, BytesWritable v) { + BytesWritable old = entries.get(k); + entries.put(k, v); + return old; + } + + public BytesWritable put(String k, String v) { + try { + return put(k, new BytesWritable(v.getBytes("UTF-8"))); + } catch (UnsupportedEncodingException uee) { + // Shouldn't happen; UTF-8 is always supported. + throw new RuntimeException(uee); + } + } + + @Override + public BytesWritable get(Object k) { + return entries.get(k); + } + + public String getString(Object k) { + BytesWritable bytes = get(k); + if (null == bytes) { + return null; + } else { + try { + return new String(bytes.getBytes(), 0, bytes.getLength(), "UTF-8"); + } catch (UnsupportedEncodingException uee) { + // Shouldn't happen; UTF-8 is always supported. + throw new RuntimeException(uee); + } + } + } + + public void readFields(DataInput in) throws IOException { + int numEntries = WritableUtils.readVInt(in); + entries.clear(); + for (int i = 0; i < numEntries; i++) { + String key = Text.readString(in); + BytesWritable val = new BytesWritable(); + val.readFields(in); + entries.put(key, val); + } + } + + public void write(DataOutput out) throws IOException { + int numEntries = entries.size(); + WritableUtils.writeVInt(out, numEntries); + for (Map.Entry<String, BytesWritable> entry : entries.entrySet()) { + Text.writeString(out, entry.getKey()); + entry.getValue().write(out); + } + } + } + + + /** + * Describes an IndexSegment. This is one entry in the IndexTable. It + * holds the physical location of the IndexSegment in the file, as well + * as the range of entryIds and byte ranges corresponding to records + * described by the index subset in the IndexSegment. + */ + private static class IndexTableEntry implements Writable { + private long segmentOffset; + private long firstIndexId; + private long firstIndexOffset; + private long lastIndexOffset; + + public IndexTableEntry() { + } + + public IndexTableEntry(DataInput in) throws IOException { + readFields(in); + } + + private void setSegmentOffset(long offset) { + this.segmentOffset = offset; + } + + private void setFirstIndexId(long id) { + this.firstIndexId = id; + } + + private void setFirstIndexOffset(long offset) { + this.firstIndexOffset = offset; + } + + private void setLastIndexOffset(long offset) { + this.lastIndexOffset = offset; + } + + public void write(DataOutput out) throws IOException { + WritableUtils.writeVLong(out, segmentOffset); + WritableUtils.writeVLong(out, firstIndexId); + WritableUtils.writeVLong(out, firstIndexOffset); + WritableUtils.writeVLong(out, lastIndexOffset); + } + + public void readFields(DataInput in) throws IOException { + segmentOffset = WritableUtils.readVLong(in); + firstIndexId = WritableUtils.readVLong(in); + firstIndexOffset = WritableUtils.readVLong(in); + lastIndexOffset = WritableUtils.readVLong(in); + } + + /** + * @return the entryId of the first record indexed by this segment.
+ */ + public long getFirstIndexId() { + return this.firstIndexId; + } + + /** + * @return the offset of the first record indexed by this segment. + */ + public long getFirstIndexOffset() { + return this.firstIndexOffset; + } + + /** + * @return the offset of the last record indexed by this segment. + */ + public long getLastIndexOffset() { + return this.lastIndexOffset; + } + + /** + * @return the offset from the start of the file of the IndexSegment + * data itself. + */ + public long getSegmentOffset() { + return this.segmentOffset; + } + + /** + * Inform whether the user's requested offset corresponds + * to a record that starts in this IndexSegment. If this + * returns true, the requested offset may actually be in + * a previous IndexSegment. + * @param off the offset of the start of a record to test. + * @return true if the user's requested offset is in this + * or a previous IndexSegment. + */ + public boolean containsOffset(long off) { + return off <= getLastIndexOffset(); + } + } + + /** + * Class that represents the IndexSegment entries in a LobIndex. + */ + private static class IndexSegment implements Writable { + + // The main body of the IndexSegment: the record lengths + // of all the records in the IndexSegment. + private BytesWritable recordLenBytes; + + // The length of the previously recorded field (used when + // generating an index). Intermediate state used in calculation + // of the lastIndexOffset. + private long prevLength; + + // Used to write VLong-encoded lengths into a temp + // array, which are then copied into recordLenBytes. + private DataOutputBuffer outputBuffer; + + // The IndexTableEntry that describes this IndexSegment in the IndexTable. + private IndexTableEntry tableEntry; + + public IndexSegment(IndexTableEntry tableEntry) { + this.recordLenBytes = new BytesWritable(); + this.outputBuffer = new DataOutputBuffer(10); // max VLong size. + this.tableEntry = tableEntry; + } + + /** + * Read an IndexSegment from an existing file. + */ + public IndexSegment(IndexTableEntry tableEntry, DataInput in) + throws IOException { + this.recordLenBytes = new BytesWritable(); + this.outputBuffer = new DataOutputBuffer(10); + this.tableEntry = tableEntry; + readFields(in); + } + + /** + * @return the IndexTableEntry describing this IndexSegment in the + * IndexTable. + */ + public IndexTableEntry getTableEntry() { + return tableEntry; + } + + /** + * Add a recordLength to the recordLenBytes array. + */ + public void addRecordLen(long recordLen) throws IOException { + // Allocate space for the new bytes. + int numBytes = WritableUtils.getVIntSize(recordLen); + recordLenBytes.setSize(recordLenBytes.getLength() + numBytes); + + // Write the new bytes into a temporary buffer wrapped in a DataOutput. + outputBuffer.reset(); + WritableUtils.writeVLong(outputBuffer, recordLen); + + // Then copy those new bytes into the end of the recordLenBytes array. + System.arraycopy(outputBuffer.getData(), 0, recordLenBytes.getBytes(), + recordLenBytes.getLength() - numBytes, numBytes); + + // Now that we've added a new recordLength to the array, + // it's the last index. We need to calculate its offset. + // This is based on how long the previous record was. + this.tableEntry.setLastIndexOffset( + this.tableEntry.getLastIndexOffset() + this.prevLength); + + // Save this record's length (unserialized) for calculating + // lastIndexOffset for the next record. 
+ this.prevLength = recordLen; + } + + public void write(DataOutput out) throws IOException { + // Write the SEGMENT_HEADER_ID to distinguish this from a LobRecord. + WritableUtils.writeVLong(out, SEGMENT_HEADER_ID); + + // The length of the main body of the segment is the length of the + // data byte array. + int segmentBytesLen = recordLenBytes.getLength(); + WritableUtils.writeVLong(out, segmentBytesLen); + + // Write the body of the segment. + out.write(recordLenBytes.getBytes(), 0, segmentBytesLen); + } + + public void readFields(DataInput in) throws IOException { + // After the RecordStartMark, we expect to get a SEGMENT_HEADER_ID (-1). + long segmentId = WritableUtils.readVLong(in); + if (SEGMENT_HEADER_ID != segmentId) { + throw new IOException("Expected segment header id " + SEGMENT_HEADER_ID + + "; got " + segmentId); + } + + // Get the length of the rest of the segment, in bytes. + long length = WritableUtils.readVLong(in); + + // Now read the actual main byte array. + if (length > Integer.MAX_VALUE) { + throw new IOException("Unexpected oversize data array length: " + + length); + } else if (length < 0) { + throw new IOException("Unexpected undersize data array length: " + + length); + } + byte [] segmentData = new byte[(int) length]; + in.readFully(segmentData); + recordLenBytes = new BytesWritable(segmentData); + + reset(); // Reset the iterator allowing the user to yield offset/lengths. + } + + + // The following methods are used by a Reader to walk through the index + // segment and get data about the records described in this segment of + // the index. + + private DataInputBuffer dataInputBuf; + + // The following two fields are advanced by the next() method. + private long curOffset; // offset into the file of the current record. + private long curLen; // length of the current record in bytes. + + // Used to allow rewindOnce() to go backwards a single position in the + // iterator. + private int prevInputBufPos; // prev offset into dataInputBuf. + private long prevOffset; + private long prevLen; + + /** + * Resets the record index iterator. + */ + public void reset() { + this.dataInputBuf = null; + } + + /** + * Aligns the iteration capability to return info about the next + * record in the IndexSegment. Must be called before the first + * record. + * @return true if there is another record described in this IndexSegment. + */ + public boolean next() { + this.prevOffset = this.curOffset; + if (null == dataInputBuf) { + // We need to set up the iterator; this is the first use. + if (null == recordLenBytes) { + return false; // We don't have any records? + } + + this.dataInputBuf = new DataInputBuffer(); + this.dataInputBuf.reset(recordLenBytes.getBytes(), + 0, recordLenBytes.getLength()); + + this.curOffset = this.tableEntry.getFirstIndexOffset(); + this.prevOffset = 0; + } else { + this.curOffset += this.curLen; + } + + boolean available = dataInputBuf.getPosition() < dataInputBuf.getLength(); + if (available) { + this.prevInputBufPos = dataInputBuf.getPosition(); + // Then read out the next record length. + try { + this.prevLen = this.curLen; + this.curLen = WritableUtils.readVLong(dataInputBuf); + } catch (IOException ioe) { + // Shouldn't happen; data in DataInputBuffer is materialized. + throw new RuntimeException(ioe); + } + } + + return available; + } + + /** + * Undoes a single call to next(). This cannot be called twice in a row; + * before calling this again, next() must be called in the interim. 
This + * makes a subsequent call to next() yield the same iterated values as the + * previous call. + */ + public void rewindOnce() { + // Move the buffer backwards so we deserialize the same VLong with + // the next call. + if (prevInputBufPos == 0) { + // We actually rewound the first next() in the iterator. + // Just reset the iterator to the beginning. Otherwise we'll + // backfill it with bogus data. + reset(); + } else { + // Use the normal codepath; move the serialization buffer + // backwards and restores the previously yielded values. + dataInputBuf.reset(recordLenBytes.getBytes(), prevInputBufPos, + recordLenBytes.getLength() - prevInputBufPos); + + // And restore the previously-yielded values. + this.curLen = this.prevLen; + this.curOffset = this.prevOffset; + } + } + + /** + * Returns the length of the current record. + * You must call next() and it must return true before calling this method. + * @return the length in bytes of the current record. + */ + public long getCurRecordLen() { + return curLen; + } + + /** + * Returns the offset of the current record from the beginning of the file. + * You must call next() and it must return true before calling this method. + * @return the offset in bytes from the beginning of the file for the + * current record. + */ + public long getCurRecordStart() { + return curOffset; + } + } + + /** + * Stores the locations and ranges indexed by each IndexSegment. + */ + private static class IndexTable + implements Iterable, Writable { + private List tableEntries; + + public IndexTable() { + tableEntries = new ArrayList(); + } + + public IndexTable(DataInput in) throws IOException { + readFields(in); + } + + public void readFields(DataInput in) throws IOException { + long recordTypeId = WritableUtils.readVLong(in); + if (recordTypeId != INDEX_TABLE_ID) { + // We expected to read an IndexTable. + throw new IOException("Expected IndexTable; got record with typeId=" + + recordTypeId); + } + + int tableCount = WritableUtils.readVInt(in); + + tableEntries = new ArrayList(tableCount); + for (int i = 0; i < tableCount; i++) { + tableEntries.add(new IndexTableEntry(in)); + } + } + + public void write(DataOutput out) throws IOException { + // Start with the record type id. + WritableUtils.writeVLong(out, INDEX_TABLE_ID); + + // Then the count of the records. + WritableUtils.writeVInt(out, tableEntries.size()); + + // Followed by the table itself. + for (IndexTableEntry entry : tableEntries) { + entry.write(out); + } + } + + public void add(IndexTableEntry entry) { + tableEntries.add(entry); + } + + public IndexTableEntry get(int i) { + return tableEntries.get(i); + } + + public int size() { + return tableEntries.size(); + } + + public Iterator iterator() { + return tableEntries.iterator(); + } + } + + /** + * Reader implementation for LobFile format version 0. Acquire with + * LobFile.open(). + */ + private static class V0Reader extends com.cloudera.sqoop.io.LobFile.Reader { + public static final Log LOG = LogFactory.getLog( + V0Reader.class.getName()); + + // Forward seeks of up to this size are performed by reading, not seeking. + private static final long MAX_CONSUMPTION_WIDTH = 512 * 1024; + + private LobFileHeader header; + + private Configuration conf; + + // Codec to use to decompress the file. + private CompressionCodec codec; + private Decompressor decompressor; + + // Length of the entire file. + private long fileLen; + + // State bit set to true after we've called next() and successfully + // aligned on a record. 
If true, we can hand an InputStream back to + // the user. + private boolean isAligned; + + // After we've aligned on a record, this contains the record's + // reported length. In the presence of compression, etc, this may + // not represent its true length in the file. + private long claimedRecordLen; + + // After we've aligned on a record, this contains its entryId. + private long curEntryId; + + // After we've aligned on a record, this contains the offset of the + // beginning of its RSM from the start of the file. + private long curRecordOffset; + + // After we've aligned on a record, this contains the record's + // true length from the index. + private long indexRecordLen; + + // tmp buffer used to consume RecordStartMarks during alignment. + private byte [] tmpRsmBuf; + + // The actual file stream itself, which we can move around (e.g. with + // seeking). + private FSDataInputStream underlyingInput; + + // The data deserializer we typically place on top of this. + // If we use underlyingInput.seek(), then we instantiate a new + // dataIn on top of it. + private DataInputStream dataIn; + + // The user accesses the current record through a stream memoized here. + // We retain a pointer here so that we can forcibly close the old + // userInputStream when they want to align on the next record. + private InputStream userInputStream; + + // The current index segment to read record lengths from. + private IndexSegment curIndexSegment; + + // The offset into the indexTable of the curIndexSegment. + private int curIndexSegmentId; + + // The IndexTable that provides fast pointers to the IndexSegments. + private IndexTable indexTable; + + // The path being opened. + private Path path; + + // Users should use LobFile.open() instead of directly calling this. + V0Reader(Path path, Configuration conf, LobFileHeader header, + DataInputStream dis, FSDataInputStream stream, long fileLen) + throws IOException { + this.path = LobReaderCache.qualify(path, conf); + this.conf = conf; + this.header = header; + this.dataIn = dis; + this.underlyingInput = stream; + this.isAligned = false; + this.tmpRsmBuf = new byte[RecordStartMark.START_MARK_LENGTH]; + this.fileLen = fileLen; + LOG.debug("Opening LobFile path: " + path); + openCodec(); + openIndex(); + } + + /** + * If the user has specified a compression codec in the header metadata, + * create an instance of it. + */ + private void openCodec() throws IOException { + String codecName = header.getMetaBlock().getString( + MetaBlock.COMPRESSION_CODEC_KEY); + if (null != codecName) { + LOG.debug("Decompressing file with codec: " + codecName); + this.codec = CodecMap.getCodec(codecName, conf); + if (null != this.codec) { + this.decompressor = codec.createDecompressor(); + } + } + } + + /** + * Get the first index segment out of the file; determine + * where that is by loading the index locator at the end of + * the file. + */ + private void openIndex() throws IOException { + // Jump to the end of the file. + // At the end of the file is a RSM followed by two VLongs; + // the first of these is the value -2 (one byte) and the + // second of these is the offset of the beginning of the index (up to + // 9 bytes). + internalSeek(fileLen - RecordStartMark.START_MARK_LENGTH - 10); + + byte [] finaleBuffer = new byte[RecordStartMark.START_MARK_LENGTH + 10]; + this.dataIn.readFully(finaleBuffer); + + // Figure out where in the finaleBuffer the RSM actually starts, + // as the finale might not fully fill the finaleBuffer. 
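+ // The index offset VLong occupies anywhere from one to nine bytes, so the finale may be shorter than the buffer we just read; scan for the RecordStartMark rather than assuming it starts at offset zero.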
+ int rsmStart = findRecordStartMark(finaleBuffer); + if (-1 == rsmStart) { + throw new IOException( + "Corrupt file index; could not find index start offset."); + } + + // Wrap a buffer around those two vlongs. + int vlongStart = rsmStart + RecordStartMark.START_MARK_LENGTH; + DataInputBuffer inBuf = new DataInputBuffer(); + inBuf.reset(finaleBuffer, vlongStart, finaleBuffer.length - vlongStart); + + long offsetMarker = WritableUtils.readVLong(inBuf); + if (SEGMENT_OFFSET_ID != offsetMarker) { + // This isn't the correct signature; we got an RSM ahead of some + // other data. + throw new IOException("Invalid segment offset id: " + offsetMarker); + } + + // This will contain the position of the IndexTable. + long indexTableStart = WritableUtils.readVLong(inBuf); + LOG.debug("IndexTable begins at " + indexTableStart); + + readIndexTable(indexTableStart); + + // Set up to read records from the beginning of the file. This + // starts with the first IndexSegment. + curIndexSegmentId = 0; + loadIndexSegment(); + + // This has moved the file pointer all over but we don't need to + // worry about resetting it now. The next() method will seek the + // file pointer to the first record when the user is ready to + // consume it. + } + + /** + * Load the entire IndexTable into memory and decode it. + */ + private void readIndexTable(long indexTableOffset) throws IOException { + internalSeek(indexTableOffset); + + // Read the RecordStartMark ahead of the IndexTable. + this.dataIn.readFully(tmpRsmBuf); + if (!matchesRsm(tmpRsmBuf)) { + throw new IOException("Expected record start mark before IndexTable"); + } + + this.indexTable = new IndexTable(dataIn); + } + + /** + * Ingest the next IndexSegment. + */ + private void readNextIndexSegment() throws IOException { + this.curIndexSegmentId++; + loadIndexSegment(); + } + + /** + * Load curIndexSegment with the segment specified by curIndexSegmentId. + * The file pointer will be moved to the position after this segment. + * If the segment id does not exist, then the curIndexSegment will be + * set to null. + */ + private void loadIndexSegment() throws IOException { + if (indexTable.size() <= curIndexSegmentId || curIndexSegmentId < 0) { + // We've iterated past the last IndexSegment. Set this to null + // and return; the next() method will then return false. + this.curIndexSegment = null; + return; + } + + // Otherwise, seek to the segment and load it. + IndexTableEntry tableEntry = indexTable.get(curIndexSegmentId); + long segmentOffset = tableEntry.getSegmentOffset(); + internalSeek(segmentOffset); + readPositionedIndexSegment(); + } + + /** + * When the underlying stream is aligned on the RecordStartMark + * ahead of an IndexSegment, read in the next IndexSegment. + * After this method the curIndexSegment contains the next + * IndexSegment to read in the file; if the entire index has been + * read in this fashion, curIndexSegment will be null. + */ + private void readPositionedIndexSegment() throws IOException { + if (LOG.isDebugEnabled()) { + LOG.debug("Reading index segment at " + tell()); + } + + // Read the RecordStartMark ahead of the IndexSegment. + this.dataIn.readFully(tmpRsmBuf); + if (!matchesRsm(tmpRsmBuf)) { + throw new IOException("Expected record start mark before IndexSegment"); + } + + // Read the IndexSegment proper. + this.curIndexSegment = new IndexSegment( + this.indexTable.get(curIndexSegmentId), this.dataIn); + } + + /** + * @return true if the bytes in 'buf' starting at 'offset' match + * the RecordStartMark.
+ * @param rsm the RecordStartMark + * @param buf the buffer to check + * @param offset the offset into buf to begin checking. + */ + private boolean matchesRsm(byte [] rsm, byte [] buf, int offset) { + for (int i = 0; i < RecordStartMark.START_MARK_LENGTH; i++) { + if (buf[i + offset] != rsm[i]) { + return false; // Mismatch at position i. + } + } + + return true; // Matched the whole thing. + } + + private boolean matchesRsm(byte [] buf, int offset) { + return matchesRsm(this.header.getStartMark().getBytes(), + buf, offset); + } + + private boolean matchesRsm(byte [] buf) { + return matchesRsm(buf, 0); + } + + /** + * @return the offset in 'buf' where a RecordStartMark begins, or -1 + * if the RecordStartMark is not present in the buffer. + */ + private int findRecordStartMark(byte [] buf) { + byte [] rsm = this.header.getStartMark().getBytes(); + + for (int i = 0; i < buf.length; i++) { + if (matchesRsm(rsm, buf, i)) { + return i; + } + } + + return -1; // couldn't find it. + } + + @Override + /** {@inheritDoc} */ + public Path getPath() { + return this.path; + } + + @Override + /** {@inheritDoc} */ + public long tell() throws IOException { + checkForNull(this.underlyingInput); + return this.underlyingInput.getPos(); + } + + @Override + /** {@inheritDoc} */ + public void seek(long pos) throws IOException { + closeUserStream(); + checkForNull(this.underlyingInput); + this.isAligned = false; + searchForRecord(pos); + } + + /** + * Search the index for the first record starting on or after 'start'. + * @param start the offset in the file where we should start looking + * for a record. + */ + private void searchForRecord(long start) throws IOException { + LOG.debug("Looking for the first record at/after offset " + start); + + // Scan through the IndexTable until we find the IndexSegment + // that contains the offset. + for (int i = 0; i < indexTable.size(); i++) { + IndexTableEntry tableEntry = indexTable.get(i); + if (LOG.isDebugEnabled()) { + LOG.debug("Checking index table entry for range: " + + tableEntry.getFirstIndexOffset() + ", " + + tableEntry.getLastIndexOffset()); + } + + if (tableEntry.containsOffset(start)) { + // Seek to the IndexSegment associated with this tableEntry. + curIndexSegmentId = i; + loadIndexSegment(); + + // Use this index segment. The record index iterator + // is at the beginning of the IndexSegment, since we just + // read it in. + LOG.debug("Found matching index segment."); + while (this.curIndexSegment.next()) { + long curStart = this.curIndexSegment.getCurRecordStart(); + if (curStart >= start) { + LOG.debug("Found seek target record with offset " + curStart); + // This is the first record to meet this criterion. + // Rewind the index iterator by one so that the next() + // method will do the right thing. next() will also + // take care of actually seeking to the correct position + // in the file to read the record proper. + this.curIndexSegment.rewindOnce(); + return; + } + } + + // If it wasn't actually in this IndexSegment, then we've + // got a corrupt IndexTableEntry; the entry represented that + // the segment ran longer than it actually does. + throw new IOException("IndexTableEntry claims last offset of " + + tableEntry.getLastIndexOffset() + + " but IndexSegment ends early." + + " The IndexTable appears corrupt."); + } + } + + // If we didn't return inside the loop, then we've searched the entire + // file and it's not there. Advance the IndexSegment iterator to + // the end of the road so that next() returns false. 
+ this.curIndexSegmentId = indexTable.size(); + loadIndexSegment(); + } + + /** + * Read data from the stream and discard it. + * @param numBytes number of bytes to read and discard. + */ + private void consumeBytes(int numBytes) throws IOException { + int remaining = numBytes; + while (remaining > 0) { + int received = dataIn.skipBytes(remaining); + if (received < 1) { + throw new IOException("Could not consume additional bytes"); + } + remaining -= received; + } + } + + /** + * Seek to position 'pos' (offset from start of file). If this + * is nearby, actually just consume data from the underlying + * stream rather than doing a real seek. + * @param targetPos the position to seek to, expressed as an offset + * from the start of the file. + */ + private void internalSeek(long targetPos) throws IOException { + long curPos = this.underlyingInput.getPos(); + LOG.debug("Internal seek: target=" + targetPos + "; cur=" + curPos); + long distance = targetPos - curPos; + if (targetPos == curPos) { + LOG.debug("(no motion required)"); + return; // We're already there! + } else if (targetPos > curPos && distance < MAX_CONSUMPTION_WIDTH) { + // We're "close enough" that we should just read it. + LOG.debug("Advancing by " + distance + " bytes."); + consumeBytes((int) distance); + } else { + LOG.debug("Direct seek to target"); + this.underlyingInput.seek(targetPos); + this.dataIn = new DataInputStream(this.underlyingInput); + } + } + + /** + * Close any stream to an open record that was opened by a user. + */ + private void closeUserStream() throws IOException { + if (this.userInputStream != null) { + this.userInputStream.close(); + this.userInputStream = null; + } + } + + @Override + /** {@inheritDoc} */ + public boolean next() throws IOException { + LOG.debug("Checking for next record"); + checkForNull(this.underlyingInput); + // If the user has opened a record stream, it is now void. + closeUserStream(); + this.isAligned = false; // false until proven true. + + // Get the position of the next record start. + // Check the index: is there another record? + if (null == curIndexSegment) { + LOG.debug("Index is finished; false"); + return false; // No index remains. Ergo, no more records. + } + boolean moreInSegment = curIndexSegment.next(); + if (!moreInSegment) { + // The current IndexSegment has been exhausted. Move to the next. + LOG.debug("Loading next index segment."); + readNextIndexSegment(); + if (null == curIndexSegment) { + LOG.debug("Index is finished; false"); + return false; // No index; no records. + } + + // Try again with the next IndexSegment. + moreInSegment = curIndexSegment.next(); + } + + if (!moreInSegment) { + // Nothing left in the last IndexSegment. + LOG.debug("Last index segment is finished; false."); + this.curIndexSegment = null; + return false; + } + + // Determine where the next record starts. + this.indexRecordLen = this.curIndexSegment.getCurRecordLen(); + this.curRecordOffset = this.curIndexSegment.getCurRecordStart(); + + LOG.debug("Next record starts at position: " + this.curRecordOffset + + "; indexedLen=" + this.indexRecordLen); + + // Make sure we're at the target position. + internalSeek(this.curRecordOffset); + + // We are now on top of the next record's RecordStartMark. + // Consume the RSM and the record header. + this.dataIn.readFully(this.tmpRsmBuf); + if (!matchesRsm(tmpRsmBuf)) { + // No rsm? No dice. 
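+ // The index claimed a record starts at this position, but no RecordStartMark is present here, so the file or its index is corrupt.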
+ throw new IOException("Index contains bogus offset."); + } + + this.curEntryId = WritableUtils.readVLong(this.dataIn); + if (this.curEntryId < 0) { + // We've moved past the end of the records and started + // trying to consume the index. This is the EOF from + // the client's perspective. + LOG.debug("Indexed position is itself an IndexSegment; false."); + return false; + } + LOG.debug("Aligned on record id=" + this.curEntryId); + + this.claimedRecordLen = WritableUtils.readVLong(this.dataIn); + LOG.debug("Record has claimed length " + this.claimedRecordLen); + // We are now aligned on the start of the user's data. + this.isAligned = true; + return true; + } + + @Override + /** {@inheritDoc} */ + public boolean isRecordAvailable() { + return this.isAligned; + } + + @Override + /** {@inheritDoc} */ + public long getRecordLen() { + return this.claimedRecordLen; + } + + @Override + /** {@inheritDoc} */ + public long getRecordId() { + return this.curEntryId; + } + + @Override + /** {@inheritDoc} */ + public long getRecordOffset() { + return this.curRecordOffset; + } + + @Override + /** {@inheritDoc} */ + public InputStream readBlobRecord() throws IOException { + if (!isRecordAvailable()) { + // we're not currently aligned on a record-start. + // Try to get the next one. + if (!next()) { + // No more records available. + throw new EOFException("End of file reached."); + } + } + + // Ensure any previously-open user record stream is closed. + closeUserStream(); + + // Mark this record as consumed. + this.isAligned = false; + + // The length of the stream we can return to the user is + // the indexRecordLen minus the length of any per-record headers. + // That includes the RecordStartMark, the entryId, and the claimedLen. + long streamLen = this.indexRecordLen - RecordStartMark.START_MARK_LENGTH + - WritableUtils.getVIntSize(this.curEntryId) + - WritableUtils.getVIntSize(this.claimedRecordLen); + LOG.debug("Yielding stream to user with length " + streamLen); + this.userInputStream = new FixedLengthInputStream(this.dataIn, streamLen); + if (this.codec != null) { + // The user needs to decompress the data; wrap the InputStream. + decompressor.reset(); + this.userInputStream = new DecompressorStream( + this.userInputStream, decompressor); + } + return this.userInputStream; + } + + @Override + /** {@inheritDoc} */ + public java.io.Reader readClobRecord() throws IOException { + // Get a handle to the binary reader and then wrap it. + InputStream is = readBlobRecord(); + return new InputStreamReader(is); + } + + @Override + /** {@inheritDoc} */ + public void close() throws IOException { + closeUserStream(); + + if (null != dataIn) { + dataIn.close(); + dataIn = null; + } + + if (null != underlyingInput) { + underlyingInput.close(); + underlyingInput = null; + } + + this.isAligned = false; + } + + @Override + /** {@inheritDoc} */ + public boolean isClosed() { + return this.underlyingInput == null; + } + } + + + /** + * Concrete writer implementation for LobFile format version 0. + * Instantiate via LobFile.create(). + */ + private static class V0Writer extends com.cloudera.sqoop.io.LobFile.Writer { + public static final Log LOG = LogFactory.getLog( + V0Writer.class.getName()); + + private Configuration conf; + private Path path; + private boolean isCharData; + private LobFileHeader header; + + private String codecName; + private CompressionCodec codec; + private Compressor compressor; + + // The LobIndex we are constructing. 
+ private LinkedList indexSegments; + // Number of entries in the current IndexSegment. + private int entriesInSegment; + private IndexTable indexTable; + + // Number of entries that can be written to a single IndexSegment. + private int maxEntriesPerSegment; + + // By default we write this many entries per IndexSegment. + static final int DEFAULT_MAX_SEGMENT_ENTRIES = 4096; + + // Our OutputStream to the underlying file. + private DataOutputStream out; + + // 'out' is layered on top of this stream, which gives us a count + // of how much data we've written so far. + private CountingOutputStream countingOut; + + // State regarding the current record being written. + private long curEntryId; // entryId of the current LOB being written. + private long curClaimedLen; // The user claims a length for a record. + + // The user's OutputStream and/or Writer that writes to us. + private OutputStream userOutputStream; + private java.io.Writer userWriter; + + // The userCountingOutputStream may be the same as userOutputStream; + // but if the user is writing through a compressor, it is actually + // underneath of it. This tells us how many compressed bytes were + // really written. + private CountingOutputStream userCountingOutputStream; + + /** + * Creates a LobFile Writer for file format version 0. + * @param p the path to create. + * @param conf the configuration to use to interact with the filesystem. + * @param isCharData true if this is for CLOBs, false for BLOBs. + * @param codecName the compression codec to use (or null for none). + * @param entriesPerSegment the number of index entries per IndexSegment. + */ + V0Writer(Path p, Configuration conf, boolean isCharData, + String codecName, int entriesPerSegment) throws IOException { + + this.path = LobReaderCache.qualify(p, conf); + this.conf = conf; + this.isCharData = isCharData; + this.header = new LobFileHeader(); + this.indexSegments = new LinkedList(); + this.indexTable = new IndexTable(); + this.maxEntriesPerSegment = entriesPerSegment; + + this.codecName = codecName; + if (this.codecName != null) { + this.codec = CodecMap.getCodec(codecName, conf); + if (null != this.codec) { + this.compressor = codec.createCompressor(); + } + } + + init(); + } + + /** + * Open the file and write its header. + */ + private void init() throws IOException { + FileSystem fs = this.path.getFileSystem(conf); + FSDataOutputStream fsOut = fs.create(this.path); + this.countingOut = new CountingOutputStream( + new BufferedOutputStream(fsOut)); + this.out = new DataOutputStream(this.countingOut); + + // put any necessary config strings into the header. + MetaBlock m = this.header.getMetaBlock(); + if (isCharData) { + m.put(MetaBlock.ENTRY_ENCODING_KEY, MetaBlock.CLOB_ENCODING); + } else { + m.put(MetaBlock.ENTRY_ENCODING_KEY, MetaBlock.BLOB_ENCODING); + } + + if (null != codec) { + m.put(MetaBlock.COMPRESSION_CODEC_KEY, this.codecName); + } + + // Serialize the value of maxEntriesPerSegment as a VInt in a byte array + // and put that into the metablock as ENTRIES_PER_SEGMENT_KEY. + int segmentBufLen = WritableUtils.getVIntSize(this.maxEntriesPerSegment); + DataOutputBuffer entriesPerSegBuf = new DataOutputBuffer(segmentBufLen); + WritableUtils.writeVInt(entriesPerSegBuf, this.maxEntriesPerSegment); + byte [] entriesPerSegArray = + Arrays.copyOf(entriesPerSegBuf.getData(), segmentBufLen); + m.put(MetaBlock.ENTRIES_PER_SEGMENT_KEY, + new BytesWritable(entriesPerSegArray)); + + // Write the file header to the file. 
+ this.header.write(out); + + // Now we're ready to accept record data from the user. + } + + @Override + /** {@inheritDoc} */ + public Path getPath() { + return this.path; + } + + @Override + /** + * {@inheritDoc} + */ + public long tell() throws IOException { + checkForNull(this.out); + this.out.flush(); + return this.countingOut.getByteCount(); + } + + @Override + /** + * {@inheritDoc} + */ + public void close() throws IOException { + finishRecord(); + writeIndex(); + if (this.out != null) { + this.out.close(); + this.out = null; + } + + if (this.countingOut != null) { + this.countingOut.close(); + this.countingOut = null; + } + } + + @Override + /** + * {@inheritDoc} + */ + public void finishRecord() throws IOException { + if (null != this.userWriter) { + this.userWriter.close(); + this.userWriter = null; + } + + if (null != this.userCountingOutputStream) { + + // If there is a wrapping stream for compression, + // close this first. + if (null != this.userOutputStream + && this.userOutputStream != this.userCountingOutputStream) { + this.userOutputStream.close(); + } + + // Now close the "main" stream. + this.userCountingOutputStream.close(); + + // Write the true length of the current record to the index. + updateIndex(this.userCountingOutputStream.getByteCount() + + RecordStartMark.START_MARK_LENGTH + + WritableUtils.getVIntSize(curEntryId) + + WritableUtils.getVIntSize(curClaimedLen)); + + this.userOutputStream = null; + this.userCountingOutputStream = null; + } + + if (null != this.out) { + out.flush(); + } + } + + /** + * Write to the current IndexSegment the true compressed length of the + * record we just finished writing. + * @param curRecordLen the true length in bytes of the compressed record. + */ + private void updateIndex(long curRecordLen) throws IOException { + LOG.debug("Adding index entry: id=" + curEntryId + + "; len=" + curRecordLen); + indexSegments.getLast().addRecordLen(curRecordLen); + entriesInSegment++; + curEntryId++; + } + + /** + * Write the index itself to the file. + */ + private void writeIndex() throws IOException { + + // Write out all the segments in turn. + // As we do so, reify their offsets into the IndexTable. + for (IndexSegment segment : indexSegments) { + long segmentOffset = tell(); + segment.getTableEntry().setSegmentOffset(segmentOffset); + + header.getStartMark().write(out); + segment.write(out); + } + + long indexTableStartPos = tell(); // Save for the end of the file. + LOG.debug("IndexTable offset: " + indexTableStartPos); + + header.getStartMark().write(out); + indexTable.write(out); // write the IndexTable record. + + // Write the finale that tells us where the IndexTable begins. + header.getStartMark().write(out); + WritableUtils.writeVLong(out, SEGMENT_OFFSET_ID); + WritableUtils.writeVLong(out, indexTableStartPos); + } + + /** + * Prepare to index a new record that will soon be written to the file. + * If this is the first record in the current IndexSegment, we need + * to record its entryId and the current file position. + */ + private void startRecordIndex() throws IOException { + if (entriesInSegment == maxEntriesPerSegment + || indexSegments.size() == 0) { + // The current segment is full. Start a new one.
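+ // Capture the new segment's first entryId and the current file offset in a fresh IndexTableEntry; writeIndex() fills in the segment's own physical offset when the index is written out.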
+ this.entriesInSegment = 0; + IndexTableEntry tableEntry = new IndexTableEntry(); + IndexSegment curSegment = new IndexSegment(tableEntry); + this.indexSegments.add(curSegment); + + long filePos = tell(); + LOG.debug("Starting IndexSegment; first id=" + curEntryId + + "; off=" + filePos); + tableEntry.setFirstIndexId(curEntryId); + tableEntry.setFirstIndexOffset(filePos); + tableEntry.setLastIndexOffset(filePos); + this.indexTable.add(tableEntry); + } + } + + @Override + /** + * {@inheritDoc} + */ + public OutputStream writeBlobRecord(long claimedLen) throws IOException { + finishRecord(); // finish any previous record. + checkForNull(this.out); + startRecordIndex(); + this.header.getStartMark().write(out); + LOG.debug("Starting new record; id=" + curEntryId + + "; claimedLen=" + claimedLen); + WritableUtils.writeVLong(out, curEntryId); + WritableUtils.writeVLong(out, claimedLen); + this.curClaimedLen = claimedLen; + this.userCountingOutputStream = new CountingOutputStream( + new CloseShieldOutputStream(out)); + if (null == this.codec) { + // No codec; pass thru the same OutputStream to the user. + this.userOutputStream = this.userCountingOutputStream; + } else { + // Wrap our CountingOutputStream in a compressing OutputStream to + // give to the user. + this.compressor.reset(); + this.userOutputStream = new CompressorStream( + this.userCountingOutputStream, compressor); + } + + return this.userOutputStream; + } + + @Override + /** + * {@inheritDoc} + */ + public java.io.Writer writeClobRecord(long len) throws IOException { + if (!isCharData) { + throw new IOException( + "Can only write CLOB data to a Clob-specific LobFile"); + } + + // Get a binary handle to the record and wrap it in a java.io.Writer. + writeBlobRecord(len); + this.userWriter = new OutputStreamWriter(userOutputStream); + return this.userWriter; + } + } +} diff --git a/src/java/org/apache/sqoop/io/LobReaderCache.java b/src/java/org/apache/sqoop/io/LobReaderCache.java new file mode 100644 index 00000000..bd753740 --- /dev/null +++ b/src/java/org/apache/sqoop/io/LobReaderCache.java @@ -0,0 +1,134 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.io; + +import java.io.IOException; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import com.cloudera.sqoop.io.LobFile; + +/** + * A cache of open LobFile.Reader objects. + * This maps from filenames to the open Reader, if any. This uses the + * Singleton pattern. While nothing prevents multiple LobReaderCache + * instances, it is most useful to have a single global cache. 
This cache is + * internally synchronized; only one thread can insert or retrieve a reader + * from the cache at a time. + */ +public class LobReaderCache { + + public static final Log LOG = + LogFactory.getLog(LobReaderCache.class.getName()); + + private Map readerMap; + + /** + * Open a LobFile for read access, returning a cached reader if one is + * available, or a new reader otherwise. + * @param path the path to the LobFile to open + * @param conf the configuration to use to access the FS. + * @throws IOException if there's an error opening the file. + */ + public LobFile.Reader get(Path path, Configuration conf) + throws IOException { + + LobFile.Reader reader = null; + Path canonicalPath = qualify(path, conf); + // Look up an entry in the cache. + synchronized(this) { + reader = readerMap.remove(canonicalPath); + } + + if (null != reader && !reader.isClosed()) { + // Cache hit. return it. + LOG.debug("Using cached reader for " + canonicalPath); + return reader; + } + + // Cache miss; open the file. + LOG.debug("No cached reader available for " + canonicalPath); + return LobFile.open(path, conf); + } + + /** + * Return a reader back to the cache. If there's already a reader for + * this path, then the current reader is closed. + * @param reader the opened reader. Any record-specific subreaders should be + * closed. + * @throws IOException if there's an error accessing the path's filesystem. + */ + public void recycle(LobFile.Reader reader) throws IOException { + Path canonicalPath = reader.getPath(); + + // Check if the cache has a reader for this path already. If not, add this. + boolean cached = false; + synchronized(this) { + if (readerMap.get(canonicalPath) == null) { + LOG.debug("Caching reader for path: " + canonicalPath); + readerMap.put(canonicalPath, reader); + cached = true; + } + } + + if (!cached) { + LOG.debug("Reader already present for path: " + canonicalPath + + "; closing."); + reader.close(); + } + } + + @Override + protected synchronized void finalize() throws Throwable { + for (LobFile.Reader r : readerMap.values()) { + r.close(); + } + + super.finalize(); + } + + protected LobReaderCache() { + this.readerMap = new TreeMap(); + } + + /** + * Created a fully-qualified path object. + * @param path the path to fully-qualify with its fs URI. + * @param conf the current Hadoop FS configuration. + * @return a new path representing the same location as the input 'path', + * but with a fully-qualified URI. + */ + public static Path qualify(Path path, Configuration conf) + throws IOException { + if (null == path) { + return null; + } + + FileSystem fs = path.getFileSystem(conf); + if (null == fs) { + fs = FileSystem.get(conf); + } + return path.makeQualified(fs); + } +} diff --git a/src/java/org/apache/sqoop/io/NamedFifo.java b/src/java/org/apache/sqoop/io/NamedFifo.java new file mode 100644 index 00000000..98c4b706 --- /dev/null +++ b/src/java/org/apache/sqoop/io/NamedFifo.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.io; + +import java.io.File; +import java.io.IOException; + +import org.apache.hadoop.util.Shell; +import org.apache.log4j.Logger; + +/** + * A named FIFO channel. + */ +public class NamedFifo { + + private static final Logger LOG = Logger.getLogger(NamedFifo.class); + + private File fifoFile; + + /** Create a named FIFO object at the local fs path given by 'pathname'. */ + public NamedFifo(String pathname) { + this.fifoFile = new File(pathname); + } + + /** Create a named FIFO object at the local fs path given by the 'fifo' File + * object. */ + public NamedFifo(File fifo) { + this.fifoFile = fifo; + } + + /** + * Return the File object representing the FIFO. + */ + public File getFile() { + return this.fifoFile; + } + + /** + * Create a named FIFO object. + * The pipe will be created with permissions 0600. + * @throws IOException on failure. + */ + public void create() throws IOException { + create(0600); + } + + /** + * Create a named FIFO object with the specified fs permissions. + * This depends on the 'mknod' or 'mkfifo' (Mac OS X) system utility + * existing. (for example, provided by Linux coreutils). This object + * will be deleted when the process exits. + * @throws IOException on failure. + */ + public void create(int permissions) throws IOException { + String filename = fifoFile.toString(); + + // Format permissions as a mode string in base 8. + String modeStr = Integer.toString(permissions, 8); + + // Create the FIFO itself. + try { + String output = Shell.execCommand("mknod", "--mode=0" + modeStr, + filename, "p"); + LOG.info("mknod output:\n"+output); + } catch (IOException ex) { + LOG.info("IO error running mknod: " + ex.getMessage()); + LOG.debug("IO error running mknod", ex); + } + if (!this.fifoFile.exists()) { + LOG.info("mknod failed, falling back to mkfifo"); + String output = Shell.execCommand("mkfifo", "-m", "0" + modeStr, + filename); + LOG.info("mkfifo output:\n"+output); + } + + // Schedule the FIFO to be cleaned up when we exit. + this.fifoFile.deleteOnExit(); + } +} diff --git a/src/java/org/apache/sqoop/io/SplittableBufferedWriter.java b/src/java/org/apache/sqoop/io/SplittableBufferedWriter.java new file mode 100644 index 00000000..7cdd398f --- /dev/null +++ b/src/java/org/apache/sqoop/io/SplittableBufferedWriter.java @@ -0,0 +1,72 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.io; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * A BufferedWriter implementation that wraps around a SplittingOutputStream + * and allows splitting of the underlying stream. + * Splits occur at allowSplit() calls, or newLine() calls. + */ +public class SplittableBufferedWriter extends BufferedWriter { + + public static final Log LOG = LogFactory.getLog( + SplittableBufferedWriter.class.getName()); + + private SplittingOutputStream splitOutputStream; + private boolean alwaysFlush; + + public SplittableBufferedWriter( + final SplittingOutputStream splitOutputStream) { + super(new OutputStreamWriter(splitOutputStream)); + + this.splitOutputStream = splitOutputStream; + this.alwaysFlush = false; + } + + /** For testing. */ + protected SplittableBufferedWriter( + final SplittingOutputStream splitOutputStream, final boolean alwaysFlush) { + super(new OutputStreamWriter(splitOutputStream)); + + this.splitOutputStream = splitOutputStream; + this.alwaysFlush = alwaysFlush; + } + + public void newLine() throws IOException { + super.newLine(); + this.allowSplit(); + } + + public void allowSplit() throws IOException { + if (alwaysFlush) { + this.flush(); + } + if (this.splitOutputStream.wouldSplit()) { + LOG.debug("Starting new split"); + this.flush(); + this.splitOutputStream.allowSplit(); + } + } +} diff --git a/src/java/org/apache/sqoop/io/SplittingOutputStream.java b/src/java/org/apache/sqoop/io/SplittingOutputStream.java new file mode 100644 index 00000000..60d554f1 --- /dev/null +++ b/src/java/org/apache/sqoop/io/SplittingOutputStream.java @@ -0,0 +1,159 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.io; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.Formatter; + +import org.apache.commons.io.output.CountingOutputStream; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.compress.CompressionCodec; + +/** + * An output stream that writes to an underlying filesystem, opening + * a new file after a specified number of bytes have been written to the + * current one. 
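+ Files only roll over at allowSplit() calls, so an individual write() is never divided across two output files.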
+ */ +public class SplittingOutputStream extends OutputStream { + + public static final Log LOG = LogFactory.getLog( + SplittingOutputStream.class.getName()); + + private OutputStream writeStream; + private CountingOutputStream countingFilterStream; + private Configuration conf; + private Path destDir; + private String filePrefix; + private long cutoffBytes; + private CompressionCodec codec; + private int fileNum; + + /** + * Create a new SplittingOutputStream. + * @param conf the Configuration to use to interface with HDFS + * @param destDir the directory where the files will go (should already + * exist). + * @param filePrefix the first part of the filename, which will be appended + * by a number. This file will be placed inside destDir. + * @param cutoff the approximate number of bytes to use per file + * @param codec the CompressionCodec used to compress each output file, or + * null to leave the output uncompressed. + */ + public SplittingOutputStream(final Configuration conf, final Path destDir, + final String filePrefix, final long cutoff, final CompressionCodec codec) + throws IOException { + + this.conf = conf; + this.destDir = destDir; + this.filePrefix = filePrefix; + this.cutoffBytes = cutoff; + if (this.cutoffBytes < 0) { + this.cutoffBytes = 0; // splitting disabled. + } + this.codec = codec; + this.fileNum = 0; + + openNextFile(); + } + + /** Initialize the OutputStream to the next file to write to. + */ + private void openNextFile() throws IOException { + FileSystem fs = FileSystem.get(conf); + + StringBuffer sb = new StringBuffer(); + Formatter fmt = new Formatter(sb); + fmt.format("%05d", this.fileNum++); + String filename = filePrefix + fmt.toString(); + if (codec != null) { + filename = filename + codec.getDefaultExtension(); + } + Path destFile = new Path(destDir, filename); + LOG.debug("Opening next output file: " + destFile); + if (fs.exists(destFile)) { + Path canonicalDest = destFile.makeQualified(fs); + throw new IOException("Destination file " + canonicalDest + + " already exists"); + } + + OutputStream fsOut = fs.create(destFile); + + // Count how many actual bytes hit HDFS. + this.countingFilterStream = new CountingOutputStream(fsOut); + + if (codec != null) { + // Wrap that in a compressing stream. + this.writeStream = codec.createOutputStream(this.countingFilterStream); + } else { + // Write to the counting stream directly. + this.writeStream = this.countingFilterStream; + } + } + + /** + * @return true if allowSplit() would actually cause a split. + */ + public boolean wouldSplit() { + return this.cutoffBytes > 0 + && this.countingFilterStream.getByteCount() >= this.cutoffBytes; + } + + /** If we've written more to the disk than the user's split size, + * open the next file. + */ + private void checkForNextFile() throws IOException { + if (wouldSplit()) { + LOG.debug("Starting new split"); + this.writeStream.flush(); + this.writeStream.close(); + openNextFile(); + } + } + + /** Defines a point in the stream when it is acceptable to split to a new + file; e.g., the end of a record.
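+ If the configured byte cutoff has been passed, the current file is flushed and closed, and the next numbered file is opened in its place.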
+ */ + public void allowSplit() throws IOException { + checkForNextFile(); + } + + public void close() throws IOException { + this.writeStream.close(); + } + + public void flush() throws IOException { + this.writeStream.flush(); + } + + public void write(byte [] b) throws IOException { + this.writeStream.write(b); + } + + public void write(byte [] b, int off, int len) throws IOException { + this.writeStream.write(b, off, len); + } + + public void write(int b) throws IOException { + this.writeStream.write(b); + } +} diff --git a/src/java/org/apache/sqoop/io/UnsupportedCodecException.java b/src/java/org/apache/sqoop/io/UnsupportedCodecException.java new file mode 100644 index 00000000..7b4f65cf --- /dev/null +++ b/src/java/org/apache/sqoop/io/UnsupportedCodecException.java @@ -0,0 +1,38 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.io; + +import java.io.IOException; + +/** + * Thrown when a compression codec cannot be recognized. + */ +public class UnsupportedCodecException extends IOException { + + public UnsupportedCodecException() { + super("UnsupportedCodecException"); + } + + public UnsupportedCodecException(String msg) { + super(msg); + } + + public UnsupportedCodecException(Throwable cause) { + super(cause); + } +} diff --git a/src/java/org/apache/sqoop/lib/BigDecimalSerializer.java b/src/java/org/apache/sqoop/lib/BigDecimalSerializer.java new file mode 100644 index 00000000..d611af21 --- /dev/null +++ b/src/java/org/apache/sqoop/lib/BigDecimalSerializer.java @@ -0,0 +1,82 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.lib; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; + +import org.apache.hadoop.io.Text; + +/** + * Serialize BigDecimal classes to/from DataInput and DataOutput objects. + * + * BigDecimal is comprised of a BigInteger with an integer 'scale' field. 
+ * The BigDecimal/BigInteger can also return itself as a 'long' value. + * + * We serialize in one of two formats: + * + * First, check whether the BigInt can fit in a long: + * boolean b = BigIntegerPart > LONG_MAX || BigIntegerPart < LONG_MIN + * + * [int: scale][boolean: b == false][long: BigInt-part] + * [int: scale][boolean: b == true][string: BigInt-part.toString()] + * + * TODO(aaron): Get this to work with Hadoop's Serializations framework. + */ +public final class BigDecimalSerializer { + + private BigDecimalSerializer() { } + + public static final BigInteger LONG_MAX_AS_BIGINT = + BigInteger.valueOf(Long.MAX_VALUE); + public static final BigInteger LONG_MIN_AS_BIGINT = + BigInteger.valueOf(Long.MIN_VALUE); + + public static void write(BigDecimal d, DataOutput out) throws IOException { + int scale = d.scale(); + BigInteger bigIntPart = d.unscaledValue(); + boolean fastpath = bigIntPart.compareTo(LONG_MAX_AS_BIGINT) < 0 + && bigIntPart .compareTo(LONG_MIN_AS_BIGINT) > 0; + + out.writeInt(scale); + out.writeBoolean(fastpath); + if (fastpath) { + out.writeLong(bigIntPart.longValue()); + } else { + Text.writeString(out, bigIntPart.toString()); + } + } + + public static BigDecimal readFields(DataInput in) throws IOException { + int scale = in.readInt(); + boolean fastpath = in.readBoolean(); + BigInteger unscaledIntPart; + if (fastpath) { + long unscaledValue = in.readLong(); + unscaledIntPart = BigInteger.valueOf(unscaledValue); + } else { + String unscaledValueStr = Text.readString(in); + unscaledIntPart = new BigInteger(unscaledValueStr); + } + + return new BigDecimal(unscaledIntPart, scale); + } +} diff --git a/src/java/org/apache/sqoop/lib/BlobRef.java b/src/java/org/apache/sqoop/lib/BlobRef.java new file mode 100644 index 00000000..bff6b719 --- /dev/null +++ b/src/java/org/apache/sqoop/lib/BlobRef.java @@ -0,0 +1,130 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.lib; + +import java.io.ByteArrayInputStream; +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.regex.Matcher; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.BytesWritable; + +import com.cloudera.sqoop.io.LobFile; + +/** + * BlobRef is a wrapper that holds a BLOB either directly, or a + * reference to a file that holds the BLOB data. + */ +public class BlobRef extends + com.cloudera.sqoop.lib.LobRef { + + public static final Log LOG = LogFactory.getLog(BlobRef.class.getName()); + + public BlobRef() { + super(); + } + + public BlobRef(byte [] bytes) { + super(new BytesWritable(bytes)); + } + + /** + * Initialize a BlobRef to an external BLOB. 
+ * @param file the filename to the BLOB. May be relative to the job dir. + * @param offset the offset (in bytes) into the LobFile for this record. + * @param length the length of the record in bytes. + */ + public BlobRef(String file, long offset, long length) { + super(file, offset, length); + } + + @Override + protected InputStream getExternalSource(LobFile.Reader reader) + throws IOException { + return reader.readBlobRecord(); + } + + @Override + protected InputStream getInternalSource(BytesWritable data) { + return new ByteArrayInputStream(data.getBytes(), 0, data.getLength()); + } + + @Override + protected byte [] getInternalData(BytesWritable data) { + return Arrays.copyOf(data.getBytes(), data.getLength()); + } + + @Override + protected BytesWritable deepCopyData(BytesWritable data) { + return new BytesWritable(Arrays.copyOf(data.getBytes(), data.getLength())); + } + + @Override + public void readFieldsInternal(DataInput in) throws IOException { + // For internally-stored BLOBs, the data is a BytesWritable + // containing the actual data. + + BytesWritable data = getDataObj(); + + if (null == data) { + data = new BytesWritable(); + } + data.readFields(in); + setDataObj(data); + } + + @Override + public void writeInternal(DataOutput out) throws IOException { + getDataObj().write(out); + } + + /** + * Create a BlobRef based on parsed data from a line of text. + * This only operates correctly on external blobs; inline blobs are simply + * returned as null. You should store BLOB data in SequenceFile format + * if reparsing is necessary. + * @param inputString the text-based input data to parse. + * @return a new BlobRef containing a reference to an external BLOB, or + * an empty BlobRef if the data to be parsed is actually inline. + */ + public static com.cloudera.sqoop.lib.BlobRef parse(String inputString) { + // If inputString is of the form 'externalLob(lf,%s,%d,%d)', then this is + // an external BLOB stored at the LobFile indicated by '%s' with the next + // two arguments representing its offset and length in the file. + // Otherwise, it is an inline BLOB, which we don't support parsing of. + + Matcher m = org.apache.sqoop.lib.LobRef.EXTERNAL_MATCHER.get(); + m.reset(inputString); + if (m.matches()) { + // This is a LobFile. Extract the filename, offset and len from the + // matcher. + return new com.cloudera.sqoop.lib.BlobRef(m.group(1), + Long.valueOf(m.group(2)), Long.valueOf(m.group(3))); + } else { + // This is inline BLOB string data. + LOG.warn( + "Reparsing inline BLOB data is not supported; use SequenceFiles."); + return new com.cloudera.sqoop.lib.BlobRef(); + } + } +} diff --git a/src/java/org/apache/sqoop/lib/BooleanParser.java b/src/java/org/apache/sqoop/lib/BooleanParser.java new file mode 100644 index 00000000..cd592262 --- /dev/null +++ b/src/java/org/apache/sqoop/lib/BooleanParser.java @@ -0,0 +1,41 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.lib; + +/** + * Parse string representations of boolean values into boolean + * scalar types. + */ +public final class BooleanParser { + + /** + * Return a boolean based on the value contained in the string. + * + *

The following values are considered true: + * "true", "t", "yes", "on", "1". + * + * All other values, including 'null', are false. + * + * All comparisons are case-insensitive.
+ */ + public static boolean valueOf(final String s) { + return s != null && ("true".equalsIgnoreCase(s) || "t".equalsIgnoreCase(s) + || "1".equals(s) || "on".equalsIgnoreCase(s) + || "yes".equalsIgnoreCase(s)); + } + + private BooleanParser() { } +} diff --git a/src/java/org/apache/sqoop/lib/ClobRef.java b/src/java/org/apache/sqoop/lib/ClobRef.java new file mode 100644 index 00000000..5f01b04b --- /dev/null +++ b/src/java/org/apache/sqoop/lib/ClobRef.java @@ -0,0 +1,113 @@ +/** + * Copyright 2011 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.lib; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.regex.Matcher; + +import org.apache.hadoop.io.Text; + +import com.cloudera.sqoop.io.LobFile; + +/** + * ClobRef is a wrapper that holds a CLOB either directly, or a + * reference to a file that holds the CLOB data. + */ +public class ClobRef + extends com.cloudera.sqoop.lib.LobRef { + + public ClobRef() { + super(); + } + + public ClobRef(String chars) { + super(chars); + } + + /** + * Initialize a clobref to an external CLOB. + * @param file the filename to the CLOB. May be relative to the job dir. + * @param offset the offset (in bytes) into the LobFile for this record. + * @param length the length of the record in characters. + */ + public ClobRef(String file, long offset, long length) { + super(file, offset, length); + } + + @Override + protected Reader getExternalSource(LobFile.Reader reader) + throws IOException { + return reader.readClobRecord(); + } + + @Override + protected Reader getInternalSource(String data) { + return new StringReader(data); + } + + @Override + protected String deepCopyData(String data) { + return data; + } + + @Override + protected String getInternalData(String data) { + return data; + } + + @Override + public void readFieldsInternal(DataInput in) throws IOException { + // For internally-stored clobs, the data is written as UTF8 Text. + setDataObj(Text.readString(in)); + } + + @Override + public void writeInternal(DataOutput out) throws IOException { + Text.writeString(out, getDataObj()); + } + + /** + * Create a ClobRef based on parsed data from a line of text. + * @param inputString the text-based input data to parse. + * @return a ClobRef to the given data. + */ + public static com.cloudera.sqoop.lib.ClobRef parse(String inputString) { + // If inputString is of the form 'externalLob(lf,%s,%d,%d)', then this is + // an external CLOB stored at the LobFile indicated by '%s' with the next + // two arguments representing its offset and length in the file. + // Otherwise, it is an inline CLOB, which we read as-is. 
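The two inputs that ClobRef.parse() distinguishes can be seen in a short sketch; the LobFile name, offset, and length below are made-up values:

    import com.cloudera.sqoop.lib.ClobRef;

    public class ClobRefParseSketch {
      public static void main(String[] args) {
        // External form: LobFile name, byte offset, record length.
        ClobRef external = org.apache.sqoop.lib.ClobRef.parse(
            "externalLob(lf,_lob/large_obj_42.lob,100,5000)");
        System.out.println(external.isExternal()); // true; data is read lazily later.

        // Anything that does not match the pattern is kept as inline character data.
        ClobRef inline = org.apache.sqoop.lib.ClobRef.parse("short clob value");
        System.out.println(inline.getData());      // prints: short clob value
      }
    }

toString() on the external reference reproduces the same externalLob(lf,...) form, which is what lets the text representation round-trip through parse().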
+ + Matcher m = EXTERNAL_MATCHER.get(); + m.reset(inputString); + if (m.matches()) { + // This is a LobFile. Extract the filename, offset and len from the + // matcher. + return new com.cloudera.sqoop.lib.ClobRef(m.group(1), + Long.valueOf(m.group(2)), Long.valueOf(m.group(3))); + } else { + // This is inline CLOB string data. + return new com.cloudera.sqoop.lib.ClobRef(inputString); + } + } +} diff --git a/src/java/org/apache/sqoop/lib/DelimiterSet.java b/src/java/org/apache/sqoop/lib/DelimiterSet.java new file mode 100644 index 00000000..c79ecfd1 --- /dev/null +++ b/src/java/org/apache/sqoop/lib/DelimiterSet.java @@ -0,0 +1,205 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.lib; + + +/** + * Encapsulates a set of delimiters used to encode a record. + */ +public class DelimiterSet implements Cloneable { + + public static final char NULL_CHAR = '\000'; + + private char fieldDelim; // fields terminated by this. + private char recordDelim; // records terminated by this. + + // If these next two fields are '\000', then they are ignored. + private char enclosedBy; + private char escapedBy; + + // If true, then the enclosed-by character is applied to every + // field, not just ones containing embedded delimiters. + private boolean encloseRequired; + + /** + * Create a delimiter set with the default delimiters + * (comma for fields, newline for records). + */ + public DelimiterSet() { + this(',', '\n', NULL_CHAR, NULL_CHAR, false); + } + + /** + * Create a delimiter set with the specified delimiters. + * @param field the fields-terminated-by delimiter + * @param record the lines-terminated-by delimiter + * @param enclose the enclosed-by character + * @param escape the escaped-by character + * @param isEncloseRequired If true, enclosed-by is applied to all + * fields. If false, only applied to fields that embed delimiters. + */ + public DelimiterSet(char field, char record, char enclose, char escape, + boolean isEncloseRequired) { + this.fieldDelim = field; + this.recordDelim = record; + this.enclosedBy = enclose; + this.escapedBy = escape; + this.encloseRequired = isEncloseRequired; + } + + /** + * Sets the fields-terminated-by character. + */ + public void setFieldsTerminatedBy(char f) { + this.fieldDelim = f; + } + + /** + * @return the fields-terminated-by character. + */ + public char getFieldsTerminatedBy() { + return this.fieldDelim; + } + + /** + * Sets the end-of-record lines-terminated-by character. + */ + public void setLinesTerminatedBy(char r) { + this.recordDelim = r; + } + + /** + * @return the end-of-record (lines-terminated-by) character. + */ + public char getLinesTerminatedBy() { + return this.recordDelim; + } + + /** + * Sets the enclosed-by character. 
+ * @param e the enclosed-by character, or '\000' for no enclosing character. + */ + public void setEnclosedBy(char e) { + this.enclosedBy = e; + } + + /** + * @return the enclosed-by character, or '\000' for none. + */ + public char getEnclosedBy() { + return this.enclosedBy; + } + + /** + * Sets the escaped-by character. + * @param e the escaped-by character, or '\000' for no escape character. + */ + public void setEscapedBy(char e) { + this.escapedBy = e; + } + + /** + * @return the escaped-by character, or '\000' for none. + */ + public char getEscapedBy() { + return this.escapedBy; + } + + /** + * Set whether the enclosed-by character must be applied to all fields, + * or only fields with embedded delimiters. + */ + public void setEncloseRequired(boolean required) { + this.encloseRequired = required; + } + + /** + * @return true if the enclosed-by character must be applied to all fields, + * or false if it's only used for fields with embedded delimiters. + */ + public boolean isEncloseRequired() { + return this.encloseRequired; + } + + @Override + /** + * @return a string representation of the delimiters. + */ + public String toString() { + return "fields=" + this.fieldDelim + + " records=" + this.recordDelim + + " escape=" + this.escapedBy + + " enclose=" + this.enclosedBy + + " required=" + this.encloseRequired; + } + + /** + * Format this set of delimiters as a call to the constructor for + * this object, that would generate identical delimiters. + * @return a String that can be embedded in generated code that + * provides this set of delimiters. + */ + public String formatConstructor() { + return "new DelimiterSet((char) " + (int) this.fieldDelim + ", " + + "(char) " + (int) this.recordDelim + ", " + + "(char) " + (int) this.enclosedBy + ", " + + "(char) " + (int) this.escapedBy + ", " + + this.encloseRequired + ")"; + } + + @Override + /** + * @return a hash code for this set of delimiters. + */ + public int hashCode() { + return (int) this.fieldDelim + + (((int) this.recordDelim) << 4) + + (((int) this.escapedBy) << 8) + + (((int) this.enclosedBy) << 12) + + (((int) this.recordDelim) << 16) + + (this.encloseRequired ? 0xFEFE : 0x7070); + } + + @Override + /** + * @return true if this delimiter set is the same as another set of + * delimiters. + */ + public boolean equals(Object other) { + if (null == other) { + return false; + } else if (!other.getClass().equals(getClass())) { + return false; + } + + DelimiterSet set = (DelimiterSet) other; + return this.fieldDelim == set.fieldDelim + && this.recordDelim == set.recordDelim + && this.escapedBy == set.escapedBy + && this.enclosedBy == set.enclosedBy + && this.encloseRequired == set.encloseRequired; + } + + @Override + /** + * @return a new copy of this same set of delimiters. + */ + public Object clone() throws CloneNotSupportedException { + return super.clone(); + } +} diff --git a/src/java/org/apache/sqoop/lib/FieldFormatter.java b/src/java/org/apache/sqoop/lib/FieldFormatter.java new file mode 100644 index 00000000..eaea59f0 --- /dev/null +++ b/src/java/org/apache/sqoop/lib/FieldFormatter.java @@ -0,0 +1,139 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.lib; + +/** + * Static helper class that will help format data with quotes and escape chars. + */ +public final class FieldFormatter { + + /** + * only pass fields that are strings when --hive-drop-delims option is on. + * @param str + * @param delimiters + * @return + */ + public static String hiveStringDropDelims(String str, + com.cloudera.sqoop.lib.DelimiterSet delimiters) { + return hiveStringReplaceDelims(str, "", delimiters); + } + + /** + * replace hive delimiters with a user-defined string passed to the + * --hive-delims-replacement option. + * @param str + * @param delimiters + * @return + */ + public static String hiveStringReplaceDelims(String str, String replacement, + com.cloudera.sqoop.lib.DelimiterSet delimiters) { + String droppedDelims = str.replaceAll("\\n|\\r|\01", replacement); + return escapeAndEnclose(droppedDelims, delimiters); + } + + /** + * Takes an input string representing the value of a field, encloses it in + * enclosing chars, and escapes any occurrences of such characters in the + * middle. The escape character itself is also escaped if it appears in the + * text of the field. If there is no enclosing character, then any + * delimiters present in the field body are escaped instead. + * + * The field is enclosed only if: + * enclose != '\000', and: + * encloseRequired is true, or + * one of the fields-terminated-by or lines-terminated-by characters is + * present in the string. + * + * Escaping is not performed if the escape char is '\000'. + * + * @param str - The user's string to escape and enclose + * @param delimiters - The DelimiterSet to use identifying the escape and + * enclose semantics. If the specified escape or enclose characters are + * '\000', those operations are not performed. + * @return the escaped, enclosed version of 'str'. + */ + public static String escapeAndEnclose(String str, + com.cloudera.sqoop.lib.DelimiterSet delimiters) { + + char escape = delimiters.getEscapedBy(); + char enclose = delimiters.getEnclosedBy(); + boolean encloseRequired = delimiters.isEncloseRequired(); + + // true if we can use an escape character. + boolean escapingLegal = + com.cloudera.sqoop.lib.DelimiterSet.NULL_CHAR != escape; + String withEscapes; + + if (null == str) { + return null; + } + + if (escapingLegal) { + // escaping is legal. Escape any instances of the escape char itself. + withEscapes = str.replace("" + escape, "" + escape + escape); + } else { + // no need to double-escape + withEscapes = str; + } + + if (com.cloudera.sqoop.lib.DelimiterSet.NULL_CHAR == enclose) { + // The enclose-with character was left unset, so we can't enclose items. + + if (escapingLegal) { + // If the user has used the fields-terminated-by or + // lines-terminated-by characters in the string, escape them if we + // have an escape character. 
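A short sketch of the enclosing rules described above, using hypothetical field values (the DelimiterSet passed in is the com.cloudera.sqoop.lib type the method accepts):

    import com.cloudera.sqoop.lib.DelimiterSet;
    import org.apache.sqoop.lib.FieldFormatter;

    public class EscapeEncloseSketch {
      public static void main(String[] args) {
        // Fields end with ',', records with '\n', optional '"' enclosing, '\' escaping.
        DelimiterSet delims = new DelimiterSet(',', '\n', '"', '\\', false);
        System.out.println(FieldFormatter.escapeAndEnclose("plain", delims));
        // -> plain          (no embedded delimiter, so no quotes are added)
        System.out.println(FieldFormatter.escapeAndEnclose("last,first", delims));
        // -> "last,first"   (the embedded field delimiter forces enclosing)
      }
    }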
+ String fields = "" + delimiters.getFieldsTerminatedBy(); + String lines = "" + delimiters.getLinesTerminatedBy(); + withEscapes = withEscapes.replace(fields, "" + escape + fields); + withEscapes = withEscapes.replace(lines, "" + escape + lines); + } + + // No enclosing possible, so now return this. + return withEscapes; + } + + // if we have an enclosing character, and escaping is legal, then the + // encloser must always be escaped. + if (escapingLegal) { + withEscapes = withEscapes.replace("" + enclose, "" + escape + enclose); + } + + boolean actuallyDoEnclose = encloseRequired; + if (!actuallyDoEnclose) { + // check if the string requires enclosing. + char [] mustEncloseFor = new char[2]; + mustEncloseFor[0] = delimiters.getFieldsTerminatedBy(); + mustEncloseFor[1] = delimiters.getLinesTerminatedBy(); + for (char reason : mustEncloseFor) { + if (str.indexOf(reason) != -1) { + actuallyDoEnclose = true; + break; + } + } + } + + if (actuallyDoEnclose) { + return "" + enclose + withEscapes + enclose; + } else { + return withEscapes; + } + } + + private FieldFormatter() { } +} diff --git a/src/java/org/apache/sqoop/lib/FieldMapProcessor.java b/src/java/org/apache/sqoop/lib/FieldMapProcessor.java new file mode 100644 index 00000000..6a4ade91 --- /dev/null +++ b/src/java/org/apache/sqoop/lib/FieldMapProcessor.java @@ -0,0 +1,39 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.lib; + +import java.io.IOException; + +import com.cloudera.sqoop.lib.FieldMappable; +import com.cloudera.sqoop.lib.ProcessingException; + +/** + * Interface implemented by classes that process FieldMappable objects. + */ +public interface FieldMapProcessor { + + /** + * Allow arbitrary processing of a FieldMappable object. + * @param record an object which can emit a map of its field names to values. + * @throws IOException if the processor encounters an IO error when + * operating on this object. + * @throws ProcessingException if the FieldMapProcessor encounters + * a general processing error when operating on this object. + */ + void accept(FieldMappable record) throws IOException, ProcessingException; +} diff --git a/src/java/org/apache/sqoop/lib/FieldMappable.java b/src/java/org/apache/sqoop/lib/FieldMappable.java new file mode 100644 index 00000000..61d499bc --- /dev/null +++ b/src/java/org/apache/sqoop/lib/FieldMappable.java @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.lib; + +import java.util.Map; + +/** + * Interface describing a class capable of returning a map of the fields + * of the object to their values. + */ +public interface FieldMappable { + + /** + * Returns a map containing all fields of this record. + * @return a map from column names to the object-based values for + * this record. The map may not be null, though it may be empty. + */ + Map getFieldMap(); +} diff --git a/src/java/org/apache/sqoop/lib/JdbcWritableBridge.java b/src/java/org/apache/sqoop/lib/JdbcWritableBridge.java new file mode 100644 index 00000000..afde5855 --- /dev/null +++ b/src/java/org/apache/sqoop/lib/JdbcWritableBridge.java @@ -0,0 +1,256 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.lib; + +import java.math.BigDecimal; +import java.sql.Date; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Time; +import java.sql.Timestamp; + +import org.apache.hadoop.io.BytesWritable; + +import com.cloudera.sqoop.lib.BlobRef; +import com.cloudera.sqoop.lib.ClobRef; + +/** + * Contains a set of methods which can read db columns from a ResultSet into + * Java types, and do serialization of these types to/from DataInput/DataOutput + * for use with Hadoop's Writable implementation. This supports null values + * for all types. + */ +public final class JdbcWritableBridge { + + // Currently, cap BLOB/CLOB objects at 16 MB until we can use external + // storage. 
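A sketch of how the reader and writer helpers defined below are typically paired around plain JDBC objects; the connection URL, table, and column positions are hypothetical:

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.PreparedStatement;
    import java.sql.ResultSet;
    import java.sql.Types;
    import org.apache.sqoop.lib.JdbcWritableBridge;

    public class BridgeSketch {
      public static void main(String[] args) throws Exception {
        Connection conn = DriverManager.getConnection("jdbc:hsqldb:mem:example");
        ResultSet rs = conn.prepareStatement("SELECT id, name FROM users").executeQuery();
        while (rs.next()) {
          Integer id = JdbcWritableBridge.readInteger(1, rs); // null when the column is SQL NULL
          String name = JdbcWritableBridge.readString(2, rs);
          System.out.println(id + "\t" + name);
        }
        PreparedStatement ins =
            conn.prepareStatement("INSERT INTO users (id, name) VALUES (?, ?)");
        JdbcWritableBridge.writeInteger(null, 1, Types.INTEGER, ins); // becomes setNull()
        JdbcWritableBridge.writeString("anonymous", 2, Types.VARCHAR, ins);
        ins.executeUpdate();
      }
    }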
+ public static final long MAX_BLOB_LENGTH = 16 * 1024 * 1024; + public static final long MAX_CLOB_LENGTH = 16 * 1024 * 1024; + + private JdbcWritableBridge() { + } + + public static Integer readInteger(int colNum, ResultSet r) + throws SQLException { + int val; + val = r.getInt(colNum); + if (r.wasNull()) { + return null; + } else { + return Integer.valueOf(val); + } + } + + public static Long readLong(int colNum, ResultSet r) throws SQLException { + long val; + val = r.getLong(colNum); + if (r.wasNull()) { + return null; + } else { + return Long.valueOf(val); + } + } + + public static String readString(int colNum, ResultSet r) throws SQLException { + return r.getString(colNum); + } + + public static Float readFloat(int colNum, ResultSet r) throws SQLException { + float val; + val = r.getFloat(colNum); + if (r.wasNull()) { + return null; + } else { + return Float.valueOf(val); + } + } + + public static Double readDouble(int colNum, ResultSet r) throws SQLException { + double val; + val = r.getDouble(colNum); + if (r.wasNull()) { + return null; + } else { + return Double.valueOf(val); + } + } + + public static Boolean readBoolean(int colNum, ResultSet r) + throws SQLException { + boolean val; + val = r.getBoolean(colNum); + if (r.wasNull()) { + return null; + } else { + return Boolean.valueOf(val); + } + } + + public static Time readTime(int colNum, ResultSet r) throws SQLException { + return r.getTime(colNum); + } + + public static Timestamp readTimestamp(int colNum, ResultSet r) + throws SQLException { + return r.getTimestamp(colNum); + } + + public static Date readDate(int colNum, ResultSet r) throws SQLException { + return r.getDate(colNum); + } + + public static BytesWritable readBytesWritable(int colNum, ResultSet r) + throws SQLException { + byte [] bytes = r.getBytes(colNum); + return bytes == null ? null : new BytesWritable(bytes); + } + + public static BigDecimal readBigDecimal(int colNum, ResultSet r) + throws SQLException { + return r.getBigDecimal(colNum); + } + + public static BlobRef readBlobRef(int colNum, ResultSet r) + throws SQLException { + // Loading of BLOBs is delayed; handled by LargeObjectLoader. + return null; + } + + public static ClobRef readClobRef(int colNum, ResultSet r) + throws SQLException { + // Loading of CLOBs is delayed; handled by LargeObjectLoader. 
+ return null; + } + + public static void writeInteger(Integer val, int paramIdx, int sqlType, + PreparedStatement s) throws SQLException { + if (null == val) { + s.setNull(paramIdx, sqlType); + } else { + s.setInt(paramIdx, val); + } + } + + public static void writeLong(Long val, int paramIdx, int sqlType, + PreparedStatement s) throws SQLException { + if (null == val) { + s.setNull(paramIdx, sqlType); + } else { + s.setLong(paramIdx, val); + } + } + + public static void writeDouble(Double val, int paramIdx, int sqlType, + PreparedStatement s) throws SQLException { + if (null == val) { + s.setNull(paramIdx, sqlType); + } else { + s.setDouble(paramIdx, val); + } + } + + public static void writeBoolean(Boolean val, int paramIdx, int sqlType, + PreparedStatement s) throws SQLException { + if (null == val) { + s.setNull(paramIdx, sqlType); + } else { + s.setBoolean(paramIdx, val); + } + } + + public static void writeFloat(Float val, int paramIdx, int sqlType, + PreparedStatement s) throws SQLException { + if (null == val) { + s.setNull(paramIdx, sqlType); + } else { + s.setFloat(paramIdx, val); + } + } + + public static void writeString(String val, int paramIdx, int sqlType, + PreparedStatement s) throws SQLException { + if (null == val) { + s.setNull(paramIdx, sqlType); + } else { + s.setString(paramIdx, val); + } + } + + public static void writeTimestamp(Timestamp val, int paramIdx, int sqlType, + PreparedStatement s) throws SQLException { + if (null == val) { + s.setNull(paramIdx, sqlType); + } else { + s.setTimestamp(paramIdx, val); + } + } + + public static void writeTime(Time val, int paramIdx, int sqlType, + PreparedStatement s) throws SQLException { + if (null == val) { + s.setNull(paramIdx, sqlType); + } else { + s.setTime(paramIdx, val); + } + } + + public static void writeDate(Date val, int paramIdx, int sqlType, + PreparedStatement s) throws SQLException { + if (null == val) { + s.setNull(paramIdx, sqlType); + } else { + s.setDate(paramIdx, val); + } + } + + public static void writeBytesWritable(BytesWritable val, int paramIdx, + int sqlType, PreparedStatement s) throws SQLException { + if (null == val) { + s.setNull(paramIdx, sqlType); + } else { + // val.getBytes() is only valid in [0, len) + byte [] rawBytes = val.getBytes(); + int len = val.getLength(); + byte [] outBytes = new byte[len]; + System.arraycopy(rawBytes, 0, outBytes, 0, len); + s.setBytes(paramIdx, outBytes); + } + } + + public static void writeBigDecimal(BigDecimal val, int paramIdx, + int sqlType, PreparedStatement s) throws SQLException { + if (null == val) { + s.setNull(paramIdx, sqlType); + } else { + s.setBigDecimal(paramIdx, val); + } + } + + public static void writeBlobRef(com.cloudera.sqoop.lib.BlobRef val, + int paramIdx, int sqlType, PreparedStatement s) throws SQLException { + // TODO: support this. + throw new RuntimeException("Unsupported: Cannot export BLOB data"); + } + + public static void writeClobRef(com.cloudera.sqoop.lib.ClobRef val, + int paramIdx, int sqlType, PreparedStatement s) throws SQLException { + // TODO: support this. + throw new RuntimeException("Unsupported: Cannot export CLOB data"); + } +} diff --git a/src/java/org/apache/sqoop/lib/LargeObjectLoader.java b/src/java/org/apache/sqoop/lib/LargeObjectLoader.java new file mode 100644 index 00000000..bc51277a --- /dev/null +++ b/src/java/org/apache/sqoop/lib/LargeObjectLoader.java @@ -0,0 +1,322 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.lib; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.Reader; +import java.io.Writer; +import java.sql.Blob; +import java.sql.Clob; +import java.sql.ResultSet; +import java.sql.SQLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import com.cloudera.sqoop.io.LobFile; +import com.cloudera.sqoop.util.TaskId; + +/** + * Contains a set of methods which can read db columns from a ResultSet into + * Java types, and do serialization of these types to/from DataInput/DataOutput + * for use with Hadoop's Writable implementation. This supports null values + * for all types. + * + * This is a singleton instance class; only one may exist at a time. + * However, its lifetime is limited to the current TaskInputOutputContext's + * life. + */ +public class LargeObjectLoader implements Closeable { + + // Spill to external storage for BLOB/CLOB objects > 16 MB. + public static final long DEFAULT_MAX_LOB_LENGTH = 16 * 1024 * 1024; + + public static final String MAX_INLINE_LOB_LEN_KEY = + "sqoop.inline.lob.length.max"; + + private Configuration conf; + private Path workPath; + private FileSystem fs; + + // Handles to the open BLOB / CLOB file writers. + private LobFile.Writer curBlobWriter; + private LobFile.Writer curClobWriter; + + // Counter that is used with the current task attempt id to + // generate unique LOB file names. + private long nextLobFileId = 0; + + /** + * Create a new LargeObjectLoader. + * @param conf the Configuration to use + * @param workPath the HDFS working directory for this task. + */ + public LargeObjectLoader(Configuration conf, Path workPath) + throws IOException { + this.conf = conf; + this.workPath = workPath; + this.fs = FileSystem.get(conf); + this.curBlobWriter = null; + this.curClobWriter = null; + } + + @Override + protected synchronized void finalize() throws Throwable { + close(); + super.finalize(); + } + + @Override + public void close() throws IOException { + if (null != curBlobWriter) { + curBlobWriter.close(); + curBlobWriter = null; + } + + if (null != curClobWriter) { + curClobWriter.close(); + curClobWriter = null; + } + } + + /** + * @return a filename to use to put an external LOB in. + */ + private String getNextLobFileName() { + String file = "_lob/large_obj_" + TaskId.get(conf, "unknown_task_id") + + nextLobFileId + ".lob"; + nextLobFileId++; + + return file; + } + + /** + * Calculates a path to a new LobFile object, creating any + * missing directories. 
+ * @return a Path to a LobFile to write + */ + private Path getNextLobFilePath() throws IOException { + Path p = new Path(workPath, getNextLobFileName()); + Path parent = p.getParent(); + if (!fs.exists(parent)) { + fs.mkdirs(parent); + } + + return p; + } + + /** + * @return the current LobFile writer for BLOBs, creating one if necessary. + */ + private LobFile.Writer getBlobWriter() throws IOException { + if (null == this.curBlobWriter) { + this.curBlobWriter = LobFile.create(getNextLobFilePath(), conf, false); + } + + return this.curBlobWriter; + } + + /** + * @return the current LobFile writer for CLOBs, creating one if necessary. + */ + private LobFile.Writer getClobWriter() throws IOException { + if (null == this.curClobWriter) { + this.curClobWriter = LobFile.create(getNextLobFilePath(), conf, true); + } + + return this.curClobWriter; + } + + /** + * Returns the path being written to by a given LobFile.Writer, relative + * to the working directory of this LargeObjectLoader. + * @param w the LobFile.Writer whose path should be examined. + * @return the path this is writing to, relative to the current working dir. + */ + private String getRelativePath(LobFile.Writer w) { + Path writerPath = w.getPath(); + + String writerPathStr = writerPath.toString(); + String workPathStr = workPath.toString(); + if (!workPathStr.endsWith(File.separator)) { + workPathStr = workPathStr + File.separator; + } + + if (writerPathStr.startsWith(workPathStr)) { + return writerPathStr.substring(workPathStr.length()); + } + + // Outside the working dir; return the whole thing. + return writerPathStr; + } + + /** + * Copies all character data from the provided Reader to the provided + * Writer. Does not close handles when it's done. + * @param reader data source + * @param writer data sink + * @throws IOException if an I/O error occurs either reading or writing. + */ + private void copyAll(Reader reader, Writer writer) throws IOException { + int bufferSize = conf.getInt("io.file.buffer.size", + 4096); + char [] buf = new char[bufferSize]; + + while (true) { + int charsRead = reader.read(buf); + if (-1 == charsRead) { + break; // no more stream to read. + } + writer.write(buf, 0, charsRead); + } + } + + /** + * Copies all byte data from the provided InputStream to the provided + * OutputStream. Does not close handles when it's done. + * @param input data source + * @param output data sink + * @throws IOException if an I/O error occurs either reading or writing. + */ + private void copyAll(InputStream input, OutputStream output) + throws IOException { + int bufferSize = conf.getInt("io.file.buffer.size", + 4096); + byte [] buf = new byte[bufferSize]; + + while (true) { + int bytesRead = input.read(buf, 0, bufferSize); + if (-1 == bytesRead) { + break; // no more stream to read. + } + output.write(buf, 0, bytesRead); + } + } + + /** + * Actually read a BlobRef instance from the ResultSet and materialize + * the data either inline or to a file. + * + * @param colNum the column of the ResultSet's current row to read. + * @param r the ResultSet to read from. + * @return a BlobRef encapsulating the data in this field. + * @throws IOException if an error occurs writing to the FileSystem. + * @throws SQLException if an error occurs reading from the database. 
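A sketch of how readBlobRef() below might be driven from a result-set loop; the column index, work directory, and size threshold are assumptions for illustration:

    import java.io.IOException;
    import java.sql.ResultSet;
    import java.sql.SQLException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.sqoop.lib.LargeObjectLoader;

    public final class LobLoadSketch {
      private LobLoadSketch() { }

      /** Reads the BLOB in column 3 of every row, spilling large values to LobFiles. */
      public static void dumpBlobs(ResultSet rs, Configuration conf, Path workDir)
          throws IOException, InterruptedException, SQLException {
        // Values longer than this many bytes are written to a LobFile under workDir.
        conf.setLong(LargeObjectLoader.MAX_INLINE_LOB_LEN_KEY, 1024 * 1024);
        LargeObjectLoader loader = new LargeObjectLoader(conf, workDir);
        try {
          while (rs.next()) {
            com.cloudera.sqoop.lib.BlobRef blob = loader.readBlobRef(3, rs);
            System.out.println(blob == null ? "NULL" : blob.toString());
          }
        } finally {
          loader.close();
        }
      }
    }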
+ */ + public com.cloudera.sqoop.lib.BlobRef readBlobRef(int colNum, ResultSet r) + throws IOException, InterruptedException, SQLException { + + long maxInlineLobLen = conf.getLong( + MAX_INLINE_LOB_LEN_KEY, + DEFAULT_MAX_LOB_LENGTH); + + Blob b = r.getBlob(colNum); + if (null == b) { + return null; + } else if (b.length() > maxInlineLobLen) { + // Deserialize very large BLOBs into separate files. + long len = b.length(); + LobFile.Writer lobWriter = getBlobWriter(); + + long recordOffset = lobWriter.tell(); + InputStream is = null; + OutputStream os = lobWriter.writeBlobRecord(len); + try { + is = b.getBinaryStream(); + copyAll(is, os); + } finally { + if (null != os) { + os.close(); + } + + if (null != is) { + is.close(); + } + + // Mark the record as finished. + lobWriter.finishRecord(); + } + + return new com.cloudera.sqoop.lib.BlobRef( + getRelativePath(curBlobWriter), recordOffset, len); + } else { + // This is a 1-based array. + return new com.cloudera.sqoop.lib.BlobRef( + b.getBytes(1, (int) b.length())); + } + } + + + /** + * Actually read a ClobRef instance from the ResultSet and materialize + * the data either inline or to a file. + * + * @param colNum the column of the ResultSet's current row to read. + * @param r the ResultSet to read from. + * @return a ClobRef encapsulating the data in this field. + * @throws IOException if an error occurs writing to the FileSystem. + * @throws SQLException if an error occurs reading from the database. + */ + public com.cloudera.sqoop.lib.ClobRef readClobRef(int colNum, ResultSet r) + throws IOException, InterruptedException, SQLException { + + long maxInlineLobLen = conf.getLong( + MAX_INLINE_LOB_LEN_KEY, + DEFAULT_MAX_LOB_LENGTH); + + Clob c = r.getClob(colNum); + if (null == c) { + return null; + } else if (c.length() > maxInlineLobLen) { + // Deserialize large CLOB into separate file. + long len = c.length(); + LobFile.Writer lobWriter = getClobWriter(); + + long recordOffset = lobWriter.tell(); + Reader reader = null; + Writer w = lobWriter.writeClobRecord(len); + try { + reader = c.getCharacterStream(); + copyAll(reader, w); + } finally { + if (null != w) { + w.close(); + } + + if (null != reader) { + reader.close(); + } + + // Mark the record as finished. + lobWriter.finishRecord(); + } + + return new com.cloudera.sqoop.lib.ClobRef( + getRelativePath(lobWriter), recordOffset, len); + } else { + // This is a 1-based array. + return new com.cloudera.sqoop.lib.ClobRef( + c.getSubString(1, (int) c.length())); + } + } +} diff --git a/src/java/org/apache/sqoop/lib/LobRef.java b/src/java/org/apache/sqoop/lib/LobRef.java new file mode 100644 index 00000000..d6d6b253 --- /dev/null +++ b/src/java/org/apache/sqoop/lib/LobRef.java @@ -0,0 +1,329 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sqoop.lib; + +import java.io.Closeable; +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; + +import com.cloudera.sqoop.io.LobFile; +import com.cloudera.sqoop.io.LobReaderCache; + +/** + * Abstract base class that holds a reference to a Blob or a Clob. + * DATATYPE is the type being held (e.g., a byte array). + * CONTAINERTYPE is the type used to hold this data (e.g., BytesWritable). + * ACCESSORTYPE is the type used to access this data in a streaming fashion + * (either an InputStream or a Reader). + */ +public abstract class LobRef<DATATYPE, CONTAINERTYPE, ACCESSORTYPE> + implements Closeable, Writable { + + public static final Log LOG = LogFactory.getLog(LobRef.class.getName()); + + protected LobRef() { + this.fileName = null; + this.offset = 0; + this.length = 0; + + this.realData = null; + } + + protected LobRef(CONTAINERTYPE container) { + this.fileName = null; + this.offset = 0; + this.length = 0; + + this.realData = container; + } + + protected LobRef(String file, long offset, long length) { + this.fileName = file; + this.offset = offset; + this.length = length; + + this.realData = null; + } + + // If the data is 'small', it's held directly, here. + private CONTAINERTYPE realData; + + /** Internal API to retrieve the data object. */ + protected CONTAINERTYPE getDataObj() { + return realData; + } + + /** Internal API to set the data object. */ + protected void setDataObj(CONTAINERTYPE data) { + this.realData = data; + } + + // If the data is too large to materialize fully, it's written into a file + // whose path (relative to the rest of the dataset) is recorded here. This + // takes precedence if the value of fileName is non-null. These records are + // currently written into LobFile-formatted files, which hold multiple + // records. The starting offset and length of the record are recorded here + // as well. + private String fileName; + private long offset; + private long length; + + // If we've opened a LobFile object, track our reference to it here. + private LobFile.Reader lobReader; + + @Override + @SuppressWarnings("unchecked") + /** + * Clone the current reference object. Data is deep-copied; any open + * file handle remains with the original only. + */ + public Object clone() throws CloneNotSupportedException { + LobRef<DATATYPE, CONTAINERTYPE, ACCESSORTYPE> r = + (LobRef<DATATYPE, CONTAINERTYPE, ACCESSORTYPE>) super.clone(); + + r.lobReader = null; // Reference to opened reader is not duplicated. + if (null != realData) { + r.realData = deepCopyData(realData); + } + + return r; + } + + @Override + protected synchronized void finalize() throws Throwable { + close(); + super.finalize(); + } + + public void close() throws IOException { + // Discard any open LobReader. + if (null != this.lobReader) { + LobReaderCache.getCache().recycle(this.lobReader); + } + } + + /** + * @return true if the LOB data is in an external file; false if + * it is materialized inline.
+ */ + public boolean isExternal() { + return fileName != null; + } + + /** + * Convenience method to access #getDataStream(Configuration, Path) + * from within a map task that read this LobRef from a file-based + * InputSplit. + * @param mapContext the Mapper.Context instance that encapsulates + * the current map task. + * @return an object that lazily streams the record to the client. + * @throws IllegalArgumentException if it cannot find the source + * path for this LOB based on the MapContext. + * @throws IOException if it could not read the LOB from external storage. + */ + public ACCESSORTYPE getDataStream(Mapper.Context mapContext) + throws IOException { + InputSplit split = mapContext.getInputSplit(); + if (split instanceof FileSplit) { + Path basePath = ((FileSplit) split).getPath().getParent(); + return getDataStream(mapContext.getConfiguration(), + basePath); + } else { + throw new IllegalArgumentException( + "Could not ascertain LOB base path from MapContext."); + } + } + + /** + * Get access to the LOB data itself. + * This method returns a lazy reader of the LOB data, accessing the + * filesystem for external LOB storage as necessary. + * @param conf the Configuration used to access the filesystem + * @param basePath the base directory where the table records are + * stored. + * @return an object that lazily streams the record to the client. + * @throws IOException if it could not read the LOB from external storage. + */ + public ACCESSORTYPE getDataStream(Configuration conf, Path basePath) + throws IOException { + if (isExternal()) { + // Read from external storage. + Path pathToRead = LobReaderCache.qualify( + new Path(basePath, fileName), conf); + LOG.debug("Retreving data stream from external path: " + pathToRead); + if (lobReader != null) { + // We already have a reader open to a LobFile. Is it the correct file? + if (!pathToRead.equals(lobReader.getPath())) { + // No. Close this.lobReader and get the correct one. + LOG.debug("Releasing previous external reader for " + + lobReader.getPath()); + LobReaderCache.getCache().recycle(lobReader); + lobReader = LobReaderCache.getCache().get(pathToRead, conf); + } + } else { + lobReader = LobReaderCache.getCache().get(pathToRead, conf); + } + + // We now have a LobFile.Reader associated with the correct file. Get to + // the correct offset and return an InputStream/Reader to the user. + if (lobReader.tell() != offset) { + LOG.debug("Seeking to record start offset " + offset); + lobReader.seek(offset); + } + + if (!lobReader.next()) { + throw new IOException("Could not locate record at " + pathToRead + + ":" + offset); + } + + return getExternalSource(lobReader); + } else { + // This data is already materialized in memory; wrap it and return. + return getInternalSource(realData); + } + } + + /** + * Using the LobFile reader, get an accessor InputStream or Reader to the + * underlying data. + */ + protected abstract ACCESSORTYPE getExternalSource(LobFile.Reader reader) + throws IOException; + + /** + * Wrap the materialized data in an InputStream or Reader. + */ + protected abstract ACCESSORTYPE getInternalSource(CONTAINERTYPE data); + + /** + * @return the materialized data itself. + */ + protected abstract DATATYPE getInternalData(CONTAINERTYPE data); + + /** + * Make a copy of the materialized data. 
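For a field that was imported as an external LOB, the data stream is only opened on demand. A sketch using the ClobRef subclass (the table directory is hypothetical; getData() for inline values is shown further below):

    import java.io.Reader;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import com.cloudera.sqoop.lib.ClobRef;

    public final class LobAccessSketch {
      private LobAccessSketch() { }

      /** Prints one CLOB field, streaming it from its LobFile when it is external. */
      public static void printClob(ClobRef ref, Configuration conf, Path tableDir)
          throws Exception {
        if (ref.isExternal()) {
          Reader r = ref.getDataStream(conf, tableDir); // opens (or reuses) a LobFile reader
          try {
            char[] buf = new char[4096];
            int n;
            while ((n = r.read(buf)) != -1) {
              System.out.print(new String(buf, 0, n));
            }
          } finally {
            r.close();
            ref.close(); // hands the LobFile reader back to the LobReaderCache
          }
        } else {
          System.out.print(ref.getData()); // inline data is already materialized
        }
      }
    }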
+ */ + protected abstract CONTAINERTYPE deepCopyData(CONTAINERTYPE data); + + public DATATYPE getData() { + if (isExternal()) { + throw new RuntimeException( + "External LOBs must be read via getDataStream()"); + } + + return getInternalData(realData); + } + + @Override + public String toString() { + if (isExternal()) { + return "externalLob(lf," + fileName + "," + Long.toString(offset) + + "," + Long.toString(length) + ")"; + } else { + return realData.toString(); + } + } + + @Override + public void readFields(DataInput in) throws IOException { + // The serialization format for this object is: + // boolean isExternal + // if true, then: + // a string identifying the external storage type + // and external-storage-specific data. + // if false, then we use readFieldsInternal() to allow BlobRef/ClobRef + // to serialize as it sees fit. + // + // Currently the only external storage supported is LobFile, identified + // by the string "lf". This serializes with the filename (as a string), + // followed by a long-valued offset and a long-valued length. + + boolean isExternal = in.readBoolean(); + if (isExternal) { + this.realData = null; + + String storageType = Text.readString(in); + if (!storageType.equals("lf")) { + throw new IOException("Unsupported external LOB storage code: " + + storageType); + } + + // Storage type "lf" is LobFile: filename, offset, length. + this.fileName = Text.readString(in); + this.offset = in.readLong(); + this.length = in.readLong(); + } else { + readFieldsInternal(in); + + this.fileName = null; + this.offset = 0; + this.length = 0; + } + } + + /** + * Perform the readFields() operation on a fully-materializable record. + * @param in the DataInput to deserialize from. + */ + protected abstract void readFieldsInternal(DataInput in) throws IOException; + + @Override + public void write(DataOutput out) throws IOException { + out.writeBoolean(isExternal()); + if (isExternal()) { + Text.writeString(out, "lf"); // storage type "lf" for LobFile. + Text.writeString(out, fileName); + out.writeLong(offset); + out.writeLong(length); + } else { + writeInternal(out); + } + } + + /** + * Perform the write() operation on a fully-materializable record. + * @param out the DataOutput to deserialize to. + */ + protected abstract void writeInternal(DataOutput out) throws IOException; + + + protected static final ThreadLocal EXTERNAL_MATCHER = + new ThreadLocal() { + @Override protected Matcher initialValue() { + Pattern externalPattern = Pattern.compile( + "externalLob\\(lf,(.*),([0-9]+),([0-9]+)\\)"); + return externalPattern.matcher(""); + } + }; + + + +} diff --git a/src/java/org/apache/sqoop/lib/LobSerializer.java b/src/java/org/apache/sqoop/lib/LobSerializer.java new file mode 100644 index 00000000..a30ffe78 --- /dev/null +++ b/src/java/org/apache/sqoop/lib/LobSerializer.java @@ -0,0 +1,54 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.lib; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +/** + * Serialize LOB classes to/from DataInput and DataOutput objects. + */ +public final class LobSerializer { + + private LobSerializer() { } + + public static void writeClob( + com.cloudera.sqoop.lib.ClobRef clob, DataOutput out) throws IOException { + clob.write(out); + } + + public static void writeBlob( + com.cloudera.sqoop.lib.BlobRef blob, DataOutput out) throws IOException { + blob.write(out); + } + + public static com.cloudera.sqoop.lib.ClobRef readClobFields( + DataInput in) throws IOException { + com.cloudera.sqoop.lib.ClobRef clob = new com.cloudera.sqoop.lib.ClobRef(); + clob.readFields(in); + return clob; + } + + public static com.cloudera.sqoop.lib.BlobRef readBlobFields( + DataInput in) throws IOException { + com.cloudera.sqoop.lib.BlobRef blob = new com.cloudera.sqoop.lib.BlobRef(); + blob.readFields(in); + return blob; + } +} diff --git a/src/java/org/apache/sqoop/lib/ProcessingException.java b/src/java/org/apache/sqoop/lib/ProcessingException.java new file mode 100644 index 00000000..453ed311 --- /dev/null +++ b/src/java/org/apache/sqoop/lib/ProcessingException.java @@ -0,0 +1,47 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.lib; + +/** + * General error during processing of a SqoopRecord. + */ +@SuppressWarnings("serial") +public class ProcessingException extends Exception { + + public ProcessingException() { + super("ProcessingException"); + } + + public ProcessingException(final String message) { + super(message); + } + + public ProcessingException(final Throwable cause) { + super(cause); + } + + public ProcessingException(final String message, final Throwable cause) { + super(message, cause); + } + + @Override + public String toString() { + String msg = getMessage(); + return (null == msg) ? "ProcessingException" : msg; + } +} diff --git a/src/java/org/apache/sqoop/lib/RecordParser.java b/src/java/org/apache/sqoop/lib/RecordParser.java new file mode 100644 index 00000000..7c291517 --- /dev/null +++ b/src/java/org/apache/sqoop/lib/RecordParser.java @@ -0,0 +1,371 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.lib; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.Text; + +/** + * Parses a record containing one or more fields. Fields are separated + * by some FIELD_DELIMITER character, e.g. a comma or a ^A character. + * Records are terminated by a RECORD_DELIMITER character, e.g., a newline. + * + * Fields may be (optionally or mandatorily) enclosed by a quoting char + * e.g., '\"' + * + * Fields may contain escaped characters. An escape character may be, e.g., + * the '\\' character. Any character following an escape character + * is treated literally. e.g., '\n' is recorded as an 'n' character, not a + * newline. + * + * Unexpected results may occur if the enclosing character escapes itself. + * e.g., this cannot parse SQL SELECT statements where the single character + * ['] escapes to ['']. + * + * This class is not synchronized. Multiple threads must use separate + * instances of RecordParser. + * + * The fields parsed by RecordParser are backed by an internal buffer + * which is cleared when the next call to parseRecord() is made. If + * the buffer is required to be preserved, you must copy it yourself. + */ +public class RecordParser { + + public static final Log LOG = LogFactory.getLog(RecordParser.class.getName()); + + private enum ParseState { + FIELD_START, + ENCLOSED_FIELD, + UNENCLOSED_FIELD, + ENCLOSED_ESCAPE, + ENCLOSED_EXPECT_DELIMITER, + UNENCLOSED_ESCAPE + } + + /** + * An error thrown when parsing fails. + */ + public static class ParseError extends Exception { + public ParseError() { + super("ParseError"); + } + + public ParseError(final String msg) { + super(msg); + } + + public ParseError(final String msg, final Throwable cause) { + super(msg, cause); + } + + public ParseError(final Throwable cause) { + super(cause); + } + } + + private com.cloudera.sqoop.lib.DelimiterSet delimiters; + private ArrayList outputs; + + + public RecordParser(final com.cloudera.sqoop.lib.DelimiterSet delimitersIn) { + this.delimiters = delimitersIn.copy(); + this.outputs = new ArrayList(); + } + + /** + * Return a list of strings representing the fields of the input line. + * This list is backed by an internal buffer which is cleared by the + * next call to parseRecord(). + */ + public List parseRecord(CharSequence input) + throws com.cloudera.sqoop.lib.RecordParser.ParseError { + if (null == input) { + throw new com.cloudera.sqoop.lib.RecordParser.ParseError( + "null input string"); + } + + return parseRecord(CharBuffer.wrap(input)); + } + + /** + * Return a list of strings representing the fields of the input line. 
+ * This list is backed by an internal buffer which is cleared by the + * next call to parseRecord(). + */ + public List parseRecord(Text input) + throws com.cloudera.sqoop.lib.RecordParser.ParseError { + if (null == input) { + throw new com.cloudera.sqoop.lib.RecordParser.ParseError( + "null input string"); + } + + // TODO(aaron): The parser should be able to handle UTF-8 strings + // as well, to avoid this transcode operation. + return parseRecord(input.toString()); + } + + /** + * Return a list of strings representing the fields of the input line. + * This list is backed by an internal buffer which is cleared by the + * next call to parseRecord(). + */ + public List parseRecord(byte [] input) + throws com.cloudera.sqoop.lib.RecordParser.ParseError { + if (null == input) { + throw new com.cloudera.sqoop.lib.RecordParser.ParseError( + "null input string"); + } + + return parseRecord(ByteBuffer.wrap(input).asCharBuffer()); + } + + /** + * Return a list of strings representing the fields of the input line. + * This list is backed by an internal buffer which is cleared by the + * next call to parseRecord(). + */ + public List parseRecord(char [] input) + throws com.cloudera.sqoop.lib.RecordParser.ParseError { + if (null == input) { + throw new com.cloudera.sqoop.lib.RecordParser.ParseError( + "null input string"); + } + + return parseRecord(CharBuffer.wrap(input)); + } + + public List parseRecord(ByteBuffer input) + throws com.cloudera.sqoop.lib.RecordParser.ParseError { + if (null == input) { + throw new com.cloudera.sqoop.lib.RecordParser.ParseError( + "null input string"); + } + + return parseRecord(input.asCharBuffer()); + } + + // TODO(aaron): Refactor this method to be much shorter. + // CHECKSTYLE:OFF + /** + * Return a list of strings representing the fields of the input line. + * This list is backed by an internal buffer which is cleared by the + * next call to parseRecord(). + */ + public List parseRecord(CharBuffer input) + throws com.cloudera.sqoop.lib.RecordParser.ParseError { + if (null == input) { + throw new com.cloudera.sqoop.lib.RecordParser.ParseError( + "null input string"); + } + + /* + This method implements the following state machine to perform + parsing. + + Note that there are no restrictions on whether particular characters + (e.g., field-sep, record-sep, etc) are distinct or the same. The + state transitions are processed in the order seen in this comment. + + Starting state is FIELD_START + encloser -> ENCLOSED_FIELD + escape char -> UNENCLOSED_ESCAPE + field delim -> FIELD_START (for a new field) + record delim -> stops processing + all other letters get added to current field, -> UNENCLOSED FIELD + + ENCLOSED_FIELD state: + escape char goes to ENCLOSED_ESCAPE + encloser goes to ENCLOSED_EXPECT_DELIMITER + field sep or record sep gets added to the current string + normal letters get added to the current string + + ENCLOSED_ESCAPE state: + any character seen here is added literally, back to ENCLOSED_FIELD + + ENCLOSED_EXPECT_DELIMITER state: + field sep goes to FIELD_START + record sep halts processing. + all other characters are errors. 
+ + UNENCLOSED_FIELD state: + ESCAPE char goes to UNENCLOSED_ESCAPE + FIELD_SEP char goes to FIELD_START + RECORD_SEP char halts processing + normal chars or the enclosing char get added to the current string + + UNENCLOSED_ESCAPE: + add character literal to current string, return to UNENCLOSED_FIELD + */ + + char curChar = com.cloudera.sqoop.lib.DelimiterSet.NULL_CHAR; + ParseState state = ParseState.FIELD_START; + int len = input.length(); + StringBuilder sb = null; + + outputs.clear(); + + char enclosingChar = delimiters.getEnclosedBy(); + char fieldDelim = delimiters.getFieldsTerminatedBy(); + char recordDelim = delimiters.getLinesTerminatedBy(); + char escapeChar = delimiters.getEscapedBy(); + boolean enclosingRequired = delimiters.isEncloseRequired(); + + for (int pos = 0; pos < len; pos++) { + curChar = input.get(); + switch (state) { + case FIELD_START: + // ready to start processing a new field. + if (null != sb) { + // We finished processing a previous field. Add to the list. + outputs.add(sb.toString()); + } + + sb = new StringBuilder(); + if (enclosingChar == curChar) { + // got an opening encloser. + state = ParseState.ENCLOSED_FIELD; + } else if (escapeChar == curChar) { + state = ParseState.UNENCLOSED_ESCAPE; + } else if (fieldDelim == curChar) { + // we have a zero-length field. This is a no-op. + continue; + } else if (recordDelim == curChar) { + // we have a zero-length field, that ends processing. + pos = len; + } else { + // current char is part of the field. + state = ParseState.UNENCLOSED_FIELD; + sb.append(curChar); + + if (enclosingRequired) { + throw new com.cloudera.sqoop.lib.RecordParser.ParseError( + "Opening field-encloser expected at position " + pos); + } + } + + break; + + case ENCLOSED_FIELD: + if (escapeChar == curChar) { + // the next character is escaped. Treat it literally. + state = ParseState.ENCLOSED_ESCAPE; + } else if (enclosingChar == curChar) { + // we're at the end of the enclosing field. Expect an EOF or EOR char. + state = ParseState.ENCLOSED_EXPECT_DELIMITER; + } else { + // this is a regular char, or an EOF / EOR inside an encloser. Add to + // the current field string, and remain in this state. + sb.append(curChar); + } + + break; + + case UNENCLOSED_FIELD: + if (escapeChar == curChar) { + // the next character is escaped. Treat it literally. + state = ParseState.UNENCLOSED_ESCAPE; + } else if (fieldDelim == curChar) { + // we're at the end of this field; may be the start of another one. + state = ParseState.FIELD_START; + } else if (recordDelim == curChar) { + pos = len; // terminate processing immediately. + } else { + // this is a regular char. Add to the current field string, + // and remain in this state. + sb.append(curChar); + } + + break; + + case ENCLOSED_ESCAPE: + // Treat this character literally, whatever it is, and return to + // enclosed field processing. + sb.append(curChar); + state = ParseState.ENCLOSED_FIELD; + break; + + case ENCLOSED_EXPECT_DELIMITER: + // We were in an enclosed field, but got the final encloser. Now we + // expect either an end-of-field or an end-of-record. + if (fieldDelim == curChar) { + // end of one field is the beginning of the next. + state = ParseState.FIELD_START; + } else if (recordDelim == curChar) { + // stop processing. + pos = len; + } else { + // Don't know what to do with this character.
+ throw new com.cloudera.sqoop.lib.RecordParser.ParseError( + "Expected delimiter at position " + pos); + } + + break; + + case UNENCLOSED_ESCAPE: + // Treat this character literally, whatever it is, and return to + // non-enclosed field processing. + sb.append(curChar); + state = ParseState.UNENCLOSED_FIELD; + break; + + default: + throw new com.cloudera.sqoop.lib.RecordParser.ParseError( + "Unexpected parser state: " + state); + } + } + + if (state == ParseState.FIELD_START && curChar == fieldDelim) { + // we hit an EOF/EOR as the last legal character and we need to mark + // that string as recorded. This if block is outside the for-loop since + // we don't have a physical 'epsilon' token in our string. + if (null != sb) { + outputs.add(sb.toString()); + sb = new StringBuilder(); + } + } + + if (null != sb) { + // There was a field that terminated by running out of chars or an EOR + // character. Add to the list. + outputs.add(sb.toString()); + } + + return outputs; + } + // CHECKSTYLE:ON + + public boolean isEnclosingRequired() { + return delimiters.isEncloseRequired(); + } + + @Override + public String toString() { + return "RecordParser[" + delimiters.toString() + "]"; + } + + @Override + public int hashCode() { + return this.delimiters.hashCode(); + } +} diff --git a/src/java/org/apache/sqoop/lib/SqoopRecord.java b/src/java/org/apache/sqoop/lib/SqoopRecord.java new file mode 100644 index 00000000..9621ab14 --- /dev/null +++ b/src/java/org/apache/sqoop/lib/SqoopRecord.java @@ -0,0 +1,159 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sqoop.lib; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.util.Map; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.lib.db.DBWritable; + +/** + * Interface implemented by the classes generated by sqoop's orm.ClassWriter. 
+ */ +public abstract class SqoopRecord implements Cloneable, DBWritable, + com.cloudera.sqoop.lib.FieldMappable, Writable { + + public SqoopRecord() { + } + + + public abstract void parse(CharSequence s) + throws com.cloudera.sqoop.lib.RecordParser.ParseError; + public abstract void parse(Text s) + throws com.cloudera.sqoop.lib.RecordParser.ParseError; + public abstract void parse(byte [] s) + throws com.cloudera.sqoop.lib.RecordParser.ParseError; + public abstract void parse(char [] s) + throws com.cloudera.sqoop.lib.RecordParser.ParseError; + public abstract void parse(ByteBuffer s) + throws com.cloudera.sqoop.lib.RecordParser.ParseError; + public abstract void parse(CharBuffer s) + throws com.cloudera.sqoop.lib.RecordParser.ParseError; + public abstract void loadLargeObjects( + com.cloudera.sqoop.lib.LargeObjectLoader objLoader) + throws SQLException, IOException, InterruptedException; + + /** + * Inserts the data in this object into the PreparedStatement, starting + * at parameter 'offset'. + * @return the number of fields written to the statement. + */ + public abstract int write(PreparedStatement stmt, int offset) + throws SQLException; + + /** + * Format output data according to the specified delimiters. + */ + public abstract String toString( + com.cloudera.sqoop.lib.DelimiterSet delimiters); + + /** + * Use the default delimiters, but only append an end-of-record delimiter + * if useRecordDelim is true. + */ + public String toString(boolean useRecordDelim) { + // Method body should be overridden by generated classes in 1.3.0+ + if (useRecordDelim) { + // This is the existing functionality. + return toString(); + } else { + // Setting this to false requires behavior in the generated class. + throw new RuntimeException( + "toString(useRecordDelim=false) requires a newer SqoopRecord. " + + "Please regenerate your record class to use this function."); + } + } + + /** + * Format the record according to the specified delimiters. An end-of-record + * delimiter is optional, and only used if useRecordDelim is true. For + * use with TextOutputFormat, calling this with useRecordDelim=false may + * make more sense. + */ + public String toString( + com.cloudera.sqoop.lib.DelimiterSet delimiters, boolean useRecordDelim) { + if (useRecordDelim) { + return toString(delimiters); + } else { + // Setting this to false requires behavior in the generated class. + throw new RuntimeException( + "toString(delimiters, useRecordDelim=false) requires a newer " + + "SqoopRecord. Please regenerate your record class to use this " + + "function."); + } + } + + @Override + public Object clone() throws CloneNotSupportedException { + return super.clone(); + } + + /** + * Returns an integer specifying which API format version the + * generated class conforms to. Used by internal APIs for backwards + * compatibility. + * @return the API version this class was generated against. + */ + public abstract int getClassFormatVersion(); + + /** + * Use the delegate pattern to allow arbitrary processing of the + * fields of this record. + * @param processor A delegate that operates on this object. + * @throws IOException if the processor encounters an IO error when + * operating on this object. + * @throws com.cloudera.sqoop.lib.ProcessingException if the FieldMapProcessor + * encounters a general processing error when operating on this object. 
+ */ + public void delegate(com.cloudera.sqoop.lib.FieldMapProcessor processor) + throws IOException, com.cloudera.sqoop.lib.ProcessingException { + processor.accept(this); + } + + @Override + /** + * {@inheritDoc} + * @throws RuntimeException if used with a record that was generated + * before this capability was added (1.1.0). + */ + public Map getFieldMap() { + // Default implementation does not support field iteration. + // ClassWriter should provide an overriding version. + throw new RuntimeException( + "Got null field map from record. Regenerate your record class."); + } + + /** + * Allows an arbitrary field to be set programmatically to the + * specified value object. The value object must match the + * type expected for the particular field or a RuntimeException + * will result. + * @throws RuntimeException if the specified field name does not exist. + */ + public void setField(String fieldName, Object fieldVal) { + throw new RuntimeException("This SqoopRecord does not support setField(). " + + "Regenerate your record class."); + } + +}
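
For illustration, a minimal sketch of how a map task might consume the LobRef accessors introduced above. MyImportedRow and its get_photo() accessor are hypothetical stand-ins for a class generated by orm.ClassWriter, and the sketch assumes BlobRef exposes InputStream and byte[] as its accessor and data types, as the pre-existing com.cloudera.sqoop.lib classes do; none of this code is part of the patch itself.

    import java.io.IOException;
    import java.io.InputStream;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.mapreduce.Mapper;

    public class PhotoMapper
        extends Mapper<LongWritable, MyImportedRow, NullWritable, NullWritable> {
      @Override
      protected void map(LongWritable key, MyImportedRow row, Context context)
          throws IOException, InterruptedException {
        com.cloudera.sqoop.lib.BlobRef photo = row.get_photo(); // hypothetical accessor
        if (photo.isExternal()) {
          // The value was spilled to a LobFile; getDataStream(Mapper.Context)
          // derives the base path from the FileSplit and opens a lazy reader.
          InputStream in = photo.getDataStream(context);
          try {
            // ... consume the stream ...
          } finally {
            in.close();
          }
        } else {
          // Small values are fully materialized; getData() returns them directly.
          byte[] bytes = photo.getData();
          // ... use bytes ...
        }
      }
    }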
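
The readFields()/write() contract above (a leading boolean, then either the "lf" filename/offset/length tuple or the subclass's internal form) can be exercised with an in-memory round trip. A sketch, assuming the existing ClobRef(String) convenience constructor and Hadoop's DataOutputBuffer/DataInputBuffer helpers:

    import java.io.IOException;
    import org.apache.hadoop.io.DataInputBuffer;
    import org.apache.hadoop.io.DataOutputBuffer;
    import com.cloudera.sqoop.lib.ClobRef;

    public class ClobRoundTrip {
      public static void main(String[] args) throws IOException {
        ClobRef original = new ClobRef("inline character data"); // materialized; isExternal() == false
        DataOutputBuffer out = new DataOutputBuffer();
        original.write(out);   // writes 'false', then the internal representation

        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        ClobRef copy = new ClobRef();
        copy.readFields(in);   // fileName/offset/length are reset for the in-memory case

        System.out.println(copy.getData()); // prints "inline character data"
      }
    }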
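
A short sketch of driving the parser described by the state machine above. It assumes the pre-existing com.cloudera.sqoop.lib.DelimiterSet constructor takes (fields-terminated-by, lines-terminated-by, enclosed-by, escaped-by, encloseRequired) in that order:

    import java.util.List;
    import com.cloudera.sqoop.lib.DelimiterSet;
    import org.apache.sqoop.lib.RecordParser;

    public class ParseDemo {
      public static void main(String[] args)
          throws com.cloudera.sqoop.lib.RecordParser.ParseError {
        // Fields end at ',', records at '\n'; '"' optionally encloses, '\\' escapes.
        DelimiterSet delims = new DelimiterSet(',', '\n', '"', '\\', false);
        RecordParser parser = new RecordParser(delims);

        List<String> fields = parser.parseRecord("1,\"Hello, world\",ok\n");
        // Three fields come back: 1 / Hello, world / ok. The comma inside the
        // enclosed field is preserved and the trailing newline ends the record.
        System.out.println(fields);
      }
    }

The returned list is backed by the parser's internal buffer, so copy it before calling parseRecord() again, as the class javadoc warns.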
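
SqoopRecord ties the import-side parse() methods to the export-side write(PreparedStatement, int) hook. A sketch of an export-style call sequence; the generated record class, its comma/newline delimiters, and the three-column table are hypothetical:

    import java.sql.Connection;
    import java.sql.PreparedStatement;
    import java.sql.SQLException;

    public class ExportStep {
      public static void exportOne(org.apache.sqoop.lib.SqoopRecord row, Connection conn)
          throws SQLException, com.cloudera.sqoop.lib.RecordParser.ParseError {
        // Re-populate the record's fields from one line of the import files.
        row.parse("1,Aaron,engineering\n");

        PreparedStatement stmt =
            conn.prepareStatement("INSERT INTO employees VALUES (?, ?, ?)");
        try {
          int bound = row.write(stmt, 0); // binds columns starting at JDBC parameter offset + 1
          System.out.println("Bound " + bound + " columns");
          stmt.executeUpdate();
        } finally {
          stmt.close();
        }
      }
    }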