/* -*-scala-*-
*
* Extract data from `tar' archives
*
* (c) 2018 Straylight/Edgeware
*/
/*----- Licensing notice --------------------------------------------------*
*
* This file is part of the Trivial IP Encryption (TrIPE) Android app.
*
* TrIPE is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your
* option) any later version.
*
* TrIPE is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License
* along with TrIPE. If not, see .
*/
package uk.org.distorted.tripe;
/*----- Imports -----------------------------------------------------------*/
import java.io.{Closeable, InputStream};
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Date;
import sys.FileInfo;
import sys.FileInfo.{Value, FIFO, CHR, DIR, BLK, REG, LNK, HDLNK, UNK};
import Implicits.truish;
/*----- Main code ---------------------------------------------------------*/
class TarFormatError(msg: String) extends Exception(msg);
trait TarEntry {
/* Honestly, I'd rather just have `TarFile#Entry', but Scala doesn't permit
* the trait inheritance circularity. So this is a cardboard cutout
* version of `Entry'.
*/
/* Basic facts about the entry. */
def name: String;
def size: Long;
def rawtyp: Char;
def mode: Int;
def mtime: Date;
def uid: Int;
def gid: Int;
def link: String;
/* Type predicates (intentionally like `FileInfo'). */
def isfifo: Boolean = rawtyp == '6';
def ischr: Boolean = rawtyp == '3';
def isdir: Boolean = rawtyp == '5';
def isblk: Boolean = rawtyp == '4';
def isreg: Boolean = rawtyp match {
case 0 | '0' | '7' => true
case _ => false
}
def islnk: Boolean = rawtyp == '2';
def issock: Boolean = false;
def ishardlink: Boolean = rawtyp == '1';
def typ: FileInfo.Value = rawtyp match {
case 0 | '0' | '7' => REG
case '1' => HDLNK
case '2' => LNK
case '3' => CHR
case '4' => BLK
case '5' => DIR
case '6' => FIFO
case _ => UNK
}
def verbose: String = {
/* Encode information about this tar header as a string. */
val sb = new StringBuilder;
/* First, the type code. */
sb += (rawtyp match {
case 0 | '0' | '7' => '-'
case '1' => 'L'
case '2' => 'l'
case '3' => 'c'
case '4' => 'b'
case '5' => 'd'
case '6' => '|'
case _ => '?'
})
/* Then the permissions bits. Ugh, the permissions bits. */
def perm(s: Int, r: Int, w: Int, x: Int, schar: Char, Schar: Char) {
sb += (if (mode&r) 'r' else '-');
sb += (if (mode&w) 'w' else '-');
sb += (if (mode&s) { if (mode&x) schar else Schar; }
else { if (mode&x) 'x' else '-' });
}
perm(0x800, 0x100, 0x080, 0x040, 's', 'S');
perm(0x400, 0x020, 0x010, 0x008, 's', 'S');
perm(0x200, 0x004, 0x002, 0x001, 't', 'T');
/* And the rest, which is easy. */
sb ++= f" $uid%8d $gid%8d $size%12d $mtime%tFT% T): T = {
val s = stream;
try { body(s) }
finally { s.close(); }
}
}
class TarFile(in: InputStream)
extends LookaheadIterator[TarEntry] with Closeable { tar =>
/* Tokens are just objects, meaningful only for their identity. */
private[TarFile] class Token;
/* Some useful state. */
private[TarFile] var offset: Long = 0; // current byte offset
private[this] var lockp = false; // locked by open entry?
private[this] var locktok = new Token; // active lock token
private[this] var nexthdr: Long = 0; // byte offset of next header
private[this] val hdr = new Array[Byte](512); // header under consideration
/* Making sure we clean up properly. */
override def close() { in.close(); }
override protected def finalize() { super.finalize(); close(); }
private[this] def eoferr()
{ throw new TarFormatError(s"unexpected EOF (at $offset)"); }
/* Locking machinery.
*
* We work from a primitive `InputStream' which we can't seek. From this,
* we must be able to extract file contents, as an `InputStream', and parse
* file headers. We'll be badly lost if we lose track of where we are in
* the archive.
*
* So, there's a lock, which can be held by at most one actor at a time:
* either the `TarFile' itself, while it's (hopefully) reading a header
* block, or by the `Stream' object which lets the caller read an
* individual entry's content. Furthermore, if we start activating the
* per-entry streams out of order, we'll get confused about where we're
* meant to be, so there's also a `token' which represents a participant's
* right to claim the lock. The `TarFile' itself has special privileges
* and doesn't need a token, but the per-entry streams do, and only the
* stream associated with the most recently-read header is allowed to claim
* the lock.
*/
private[this] def lock() {
/* Claim exclusive use of the input stream. */
if (lockp) throw new IllegalArgumentException("tarfile lock still held");
lockp = true;
}
private[TarFile] def lock(tok: Token) {
/* Claim exclusive use of the input stream, passing a token. */
if (tok ne locktok)
throw new IllegalArgumentException("stale lock token");
lock();
}
private[TarFile] def unlock() {
/* Release the input stream so someone else can have a go. */
assert(lockp);
lockp = false;
locktok = new Token;
}
/* Doing I/O on the input stream.
*
* Our `Stream' object sneakily grabs single bytes from the input. Given
* the way Scala works, we can't prevent that, so roll with it.
*/
private[TarFile] def read(buf: Array[Byte], start: Int, len: Int) {
/* Read input data into the indicated region of the buffer. Short reads
* are diagnosed as errors. Advances the cursor.
*/
var pos = start;
val limit = start + len;
while (pos < len) {
val n = in.read(buf, pos, limit - pos);
if (n < 0) eoferr();
pos += n; offset += n;
}
}
private[TarFile] def skip(len: Long) {
/* Skip ahead LEN bytes in the archive. (The int/long discrepancy
* matches Java's bizarre `InputStream' protocol.)
*/
var remain = len;
while (remain > 0) {
val n = in.skip(remain);
if (n > 0) { remain -= n; offset += n; }
else {
/* It's hard to work out whether this means we've hit EOF or not. It
* seems best to check. We must have at least one byte left to skip
* or we wouldn't have started this iteration, so try to read that.
* If that works, then there's more stuff available and skipping
* isn't working, so start to read buffers and discard them.
*/
if (in.read() == -1) eoferr();
remain -= 1; offset += 1;
/* Ugh. So, buffers it is then. */
val buf = new Array[Byte]((remain min 4096).toInt);
while (remain >= buf.length) {
val n = (remain min buf.length).toInt;
read(buf, 0, n);
remain -= n;
}
}
}
}
private[TarFile] class Stream(end: Long, tok: Token) extends InputStream {
/* An input stream for a single archive entry's content. */
/* Claim the lock. If we're stale, this won't work. */
lock(tok);
private[this] var open = true;
private[this] def checkopen() {
/* Complain if the stream is closed. */
if (!lockp) throw new IllegalArgumentException("stream is closed");
}
override def read(): Int = {
/* Read one byte. Don't know why there isn't a default implementation
* of this.
*/
checkopen();
if (offset >= end) -1
else {
val b = in.read();
if (b == -1) eoferr();
offset += 1;
b
}
}
override def read(buf: Array[Byte], start: Int, len: Int): Int = {
/* Read a block. */
checkopen();
if (offset >= end) -1
else {
var n = (len.toLong min (end - offset)).toInt;
tar.read(buf, start, n);
n
}
}
override def close() {
/* Release the lock. */
if (open) { unlock(); open = false; }
}
}
private[this] class Entry(val name: String, val size: Long,
val rawtyp: Char, val mode: Int,
val mtime: Date,
val uid: Int, val gid: Int,
val link: String,
end: Long, tok: Token)
extends TarEntry{
/* See `TarEntry' for why we have this silliness. Most of the work is in
* the constructor above.
*/
lazy val stream: InputStream = new Stream(end, tok);
}
/* Utilities for parsing archive-entry header blocks. */
private[this] def string(off: Int, len: Int): String = {
/* Parse a string from the block header. POSIX.1-2008 says that header
* fields should be ISO/IEC 646, but strange things can turn up
* especially in filenames. I'm going to translate strings according to
* the local character set, because that will map most easily if a
* program tries to write out files from the archive with their
* associated names.
*/
/* First, find the null terminator, if there is one. Scala doesn't make
* this especially easy. Rustle up a view to limit the search.
*/
val bview = hdr.view(off, off + len);
val n = bview.indexOf(0) match {
case -1 => len
case nul => nul
};
/* And then decode the relevant portion of the orignal buffer. */
val dec = Charset.defaultCharset.newDecoder;
val in = ByteBuffer.wrap(hdr, off, n);
dec.decode(in).toString
}
private[this] def number(off: Int, len: Int, max: Long): Long = {
/* Parse a number from the block header. POSIX.1-2008 says that numbers
* are in octal and terminated by space or nul.
*/
var n = 0l; // accumulate the value
for (i <- off until off + len) {
val b = hdr(i);
/* See if we're done now. */
if (!b || b == ' ') return n;
else if (b < '0' || b > '7')
throw new TarFormatError(s"bad octal digit (at ${offset + off + i})");
/* Convert to a digit. */
val m = b - '0';
/* Check for overflow -- without overflowing.
*
* Write max 8 N + M. We overflow if 8 n + m > 8 N + M, i.e., 8 n >
* 8 N + (M - m), so n > N + (M - m)/8. This last calculation is a
* little fraught because Scala has the wrong semantics when dividing
* negative integers.
*/
if (n > max/8 + (8 + max%8 - m)/8 - 1)
throw new TarFormatError(s"number out of range (at ${offset + off})");
/* Accumulate and go round again. */
n = 8*n + (b - '0');
}
unreachable;
}
override protected def fetch(): Option[TarEntry] = {
/* Collect the next archive header and return it as a file entry. */
/* Make sure that we can actually do this. */
withCleaner { clean =>
lock(); clean { unlock(); }
/* Skip ahead to the next header. */
skip(nexthdr - offset);
/* Read the header. The end of the archive is marked by two zero
* blocks, so the archive is broken if there isn't at least one here.
*/
read(hdr, 0, 512);
}
/* If the block is entirely zero-filled then declare this file at an
* end. No good can come from checking the next block.
*/
if (hdr.forall(_ == 0)) return None;
/* Verify the checksum. Pay attention because Java's bytes are
* (idiotically) signed.
*/
var ck: Int = 8*' '; // pretend chksum field is spaces
for (i <- 0 until 148) ck += hdr(i)&0xff;
for (i <- 156 until 512) ck += hdr(i)&0xff;
val wantck = number(148, 8, 0x20000);
if (ck != wantck) {
throw new TarFormatError(
s"invalid checksum $ck /= $wantck (at $nexthdr)");
}
/* Fetch the `magic' and `version' fields. If this is a proper POSIX
* `ustar' file then special processing will apply.
*/
val magic = string(257, 6);
val version = string(263, 2);
val posixp = magic == "ustar" && version == "00";
/* Figure out this entry's name. If this is a POSIX archive, then part
* of the name is stashed at the end of the header because of old, bad
* decisions. But don't look there unless we're sure because old GNU
* `tar' used that space for other things.
*/
val name = {
val tail = string(0, 100);
if (!posixp || !hdr(345)) tail
else {
val prefix = string(345, 155);
prefix + '/' + tail
}
}
/* Read some other easy stuff. */
val mode = number(100, 8, 0xfff).toInt;
val uid = number(108, 8, Int.MaxValue).toInt;
val gid = number(116, 8, Int.MaxValue).toInt;
val typ = hdr(156).toChar;
val mtime = number(136, 12, Long.MaxValue);
/* The size is irrelevant, and possibly even misleading, for some entry
* types. We're not interested, for example, in systems where
* directories need to be preallocated.
*/
val size = typ match {
case '1' | '2' | '3' | '4' | '5' | '6' => 0
case _ => number(124, 12, Long.MaxValue)
}
/* Maybe fetch the link name. */
val link = typ match {
case '1' | '2' => string(157, 100)
case _ => ""
}
/* Figure out where the next header ought to be. */
nexthdr = (offset + size + 511)& -512;
/* Return the finished archive entry. */
Some(new Entry(name, size, typ, mode,
new Date(1000*mtime), uid, gid, link,
offset + size, locktok));
}
}
/* Example:
*
* for (e <- TarFile(new GZIPInputStream(tarball.open())); if e.isreg)
* e withStream { in =>
* val h = java.security.MessageDigest.getInstance("SHA-256");
* for ((buf, n) <- in.blocks) h.update(b, 0, n);
* val hex = new String(h.digest flatMap { _.formatted("%02x") });
* println("s$hex ${e.name}");
* }
*/
/*----- That's all, folks -------------------------------------------------*/