USENET News article: Tarfix: Manipulate tar(5) archives (was Tar Perl and Compress)

Newsgroup: comp.lang.perl

Newsgroups:	comp.lang.perl
Path:	icdoc!dds
From:	dds@doc.ic.ac.uk (Diomidis Spinellis)
Subject:	Tarfix: Manipulate tar(5) archives (was Tar Perl and Compress)
Message-ID:	<1992Jun17.133044.22193@doc.ic.ac.uk>
Sender:	usenet@doc.ic.ac.uk
Nntp-Posting-Host:	dirty.doc.ic.ac.uk
Organization:	Dept. of Computing, Imperial College, London, UK
References:	<1992Jun16.214035.490@cse.uta.edu>
Date:	Wed, 17 Jun 1992 13:30:44 GMT
Lines:	614
Content-Length:	19564
In article <1992Jun16.214035.490@cse.uta.edu> turbo@cse.uta.edu 
(Chris Turbeville) writes:
[...]
> I'd like to be able to add files to tar.Z files without needing the disk
> space to hold the uncompressed tar.
[...]

You can use tarfix (included bollow) as follows:
( zcat old.tar.Z | tarfix -e ; tar cf newfiles ) | compress >new+old.tar.Z

Diomidis

#! /bin/sh
# This is a shell archive, meaning:
# 1. Remove everything above the #! /bin/sh line.
# 2. Save the resulting text in a file.
# 3. Execute the file with /bin/sh (not csh) to create the files:
#	tarfix
#	tarfix.1
# This archive created: Wed Jun 17 14:28:55 1992
export PATH; PATH=/bin:$PATH
echo shar: extracting "'tarfix'" '(10530 characters)'
if test -f 'tarfix'
then
	echo shar: will not over-write existing file "'tarfix'"
else
sed 's/^X//' << \SHAR_EOF > 'tarfix'
X#!/usr/local/bin/perl
X@REM=("
X@perl -S %0.bat %1 %2 %3 %4 %5 %6 %7 %8 %9
X@goto end ") if 0 ;
X#
X# Fix and convert tape archives
X#
X# (C) Copyright 1990,91 Diomidis Spinellis.  All rights reserved.
X#
X# Permission to use, copy, and distribute this software and its
X# documentation for any purpose and without fee is hereby granted,
X# provided that the above copyright notice appear in all copies and that
X# both that copyright notice and this permission notice appear in
X# supporting documentation.
X# 
X# THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
X# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
X# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
X#
X# dds@doc.ic.ac.uk
X#
X# This program is not efficient.
X# A have tried instead to make it readable, flexible and easy to use.
X# For this I use lots of subroutines, local variables and higher order 
X# functions.  If you want something efficient rewrite it in C.
X#
X# You can very easily add a new conversion mode.  Just add the conversion
X# function.  If the conversion function needs to be applied to every
X# filename component add a map function and a function to prepare a component
X# to have acounter added to it.
X# If you need to convert another part of the header (e.g. uid) search for the
X# string HEADERMOD in this file.
X
Xdo 'getopts.pl' || die "$0: Unable to find getopts library: $!\n";
X&Getopts('mncaf:t:s:e') || $usage++;
X
X$count = 0;
X$proc = 'copy';
X
Xif ($opt_c) {
X	$proc = 'canonic';
X	$count++;
X}
Xif ($opt_a) {
X	$proc = 'noabs';
X	$count++;
X}
Xif ($opt_f) {
X	$proc = 'from_' . $opt_f;
X	$count++;
X}
Xif ($opt_t) {
X	$proc = 'to_' . $opt_t;
X	$count++;
X}
Xif ($opt_s) {
X	$proc = 'usershort';
X	$usershortlen = $opt_s;
X	$count++;
X}
X
Xif ($count > 1) {
X	print STDERR "$0: Only one of -c -a -f -t -s can be specified\n";
X	$usage++;
X}
X
Xif (! eval('&' . $proc . '("foo");')) {
X	print STDERR "$0: Bad option specified $proc\n";
X	$usage++;
X}
Xdelete $map{'foo'};
X
X
Xif ($usage) {
X	print STDERR "Usage: $0 -n -c -a -s N -f msdos|vms -t msdos|v7";
X	print STDERR "
X	-n Do not work on tar files.  Read and print a list of file names.
X	-c Canonicalise filenames by removing /../, /./ and //.
X	-a Fix absolute filenames by removing leading /.
X	-s Convert filenames to the specified length N.
X	-f Convert from format.  Format can be msdos or vms.
X	-t Convert to format.  Format can be v7 (7th Edition) or msdos.
X	-m Print a map table containing initial and final name on stderr.
X	-e Remove trailing EOF
X
X	Only one of -c -a -f -t -s can be specified.\n";
X	exit 1;
X}
X
Xif ($opt_n) {
X	while (<>) {
X		s/\n$//;
X		print &$proc($_), "\n";
X	}
X} else {
X	&copytar();
X}
X
Xif ($opt_m) {
X	while (($from, $to) = each(%map)) {
X		print STDERR "$from $to\n";
X	}
X}
X
Xexit 0;
X
X# Remove absolute file names.
X# We canonicalise since foo//bar is /bar on many Unixes
Xsub noabs {
X	local($name) = $_[0];
X
X	 $name = &canonic($name);
X	 $name =~ s/^\///;
X	 return $name;
X}
X
X# Convert to MS-DOS
X# - Shorten name to 8 characters
X# - Remove all dots, but the last one
X# - Shorten extension to 3 characters
X# - Convert ,=+<>|; *?:[]\" to ^
X# - Convert device name (con, aux ...) to _device
Xsub to_msdos {
X	local($nm);
X
X	$nm = $_[0];
X	return &filemap($nm, 'mapmsdos', 'countprepmsdos');
X	# The following line fails on perl 3.0 patchlevel 18 XXX
X	# return &filemap($_[0], 'mapmsdos', 'countprepmsdos');
X}
X
X# Shortening routine for MS-DOS
Xsub mapmsdos {
X	local($name) = $_[0];
X	local($ext);
X
X	# Leave only the last dot
X	while ($name =~ s/(.*)\.(.*)\.(.*)/\1_\2.\3/g) { 
X		;
X	}
X	# Convert funny characters to ^
X	$name =~ s/[,=+<>|; *?:[\]\\]/^/g;
X	# Shorten the name
X	if ($name =~ m/\./) {
X		($name, $ext) = split(/\./, $name);
X		$name =~ s/^((con)|(aux[1-4]?)|(prn)|(lpt[1-3])|(clock\$))$/_\1/i;
X		return &shorten($name, 8) . '.' . &shorten($ext, 3);
X	} else {
X		$name =~ s/^((con)|(aux[1-4]?)|(prn)|(lpt[1-3])|(clock\$))$/_\1/i;
X		return &shorten($name, 8);
X	}
X}
X
X# Count preparation routine for MS-DOS
Xsub countprepmsdos {
X	local($name) = $_[0];
X	local($ext);
X	
X	if ($name =~ m/\./) {
X		($name, $ext) = split(/\./, $name);
X		return $name . '.' . substr($ext, 0, 1);
X	} else {
X		return $name . '.';
X	}
X}
X
X
X# Convert to 7th Edition type filesystems
X# - Shorten filenames to 14 characters
Xsub to_v7 {
X	local($nm);
X
X	$nm = $_[0];
X	return &filemap($nm, 'mapv7', 'countprepv7');
X}
X
X# Shortening routine for V7
Xsub mapv7 {
X	return &shorten($_[0], 14);
X}
X
X# Count preparation routine for V7
Xsub countprepv7 {
X	return substr($_[0], 0, 12);
X}
X
X# Shorten the filename components by a user specified amount
Xsub usershort {
X	local($nm);
X
X	$nm = $_[0];
X	return &filemap($nm, 'mapusershort', 'countprepusershort');
X}
X
X# Shortening routine for usershort
Xsub mapusershort {
X	return &shorten($_[0], $usershortlen);
X}
X
X# Count preparation routine for usershort
Xsub countprepusershort {
X	return substr($_[0], 0, $usershortlen);
X}
X
X# Convert from VMS
X# - Convert uppercase to lowercase
X# - Remove leading device name: or node::
X# - Convert directory form [xxx] to xxx/
X# - Remove trailing generation number
X# - Remove quoting characters ^V and " (XXX)
X# NOTE:  I am an ignorant on VMS, so this probably need fixing.  UNTESTED
Xsub from_vms {
X	local($name) = $_[0];
X
X	$name =~ tr/[A-Z]\\/[a-z]\//;
X	$name =~ s/^[a-z]*::?//;
X	$name =~ s/\[(.*)\](.*)/\1\/\2/;
X	$name =~ s/;[0-9]+$//;
X	$name =~ s/["\026]//g;
X	return $name;
X}
X
X# Convert from MS-DOS
X# - Convert \ to /
X# - Convert uppercase to lowercase
X# - Remove leading device names
Xsub from_msdos {
X	local($name) = $_[0];
X
X	$name =~ tr/[A-Z]\\/[a-z]\//;
X	$name =~ s/^[a-z]://;
X	return $name;
X}
X
X# filemap(name, mapfunc, countprepfunc)
X# Go through every path element of name substituting it with the result
X# of mapfunc(element).  If the filename is already used then substitute it
X# with the result of applying countprepfunc to with a two letter counter 
X# appended.
X# Two associative arrays are kept to avoid the chance of re-using a name
X# %map contains the mappings from big names to small names
X# %used contains 1 for every short name that has been used
X# We keep partial file names to speed up the process
X# The filenames are always canonicalised
Xsub filemap {
X	local(
X		@big,		# Contains components of original
X		@small,		# Result is built in here
X		@bigpart,	# Part of big that has been done
X		@s,		# To try alternative mappings
X		$name,		# Part of path we are dealing with
X		$count,		# To create distinct names
X		$try,		# Remember map result
X		$mapfunc,	# Function to create new elements
X		$countprepfunc	# Function to prepare for counting
X
X	);
X
X	$mapfunc = $_[1];
X	$countprepfunc = $_[2];
X	@big = split(/\//, &canonic($_[0]));
X	@small = @bigpart = ();
X	while (defined($name = shift(@big))) {
X		push(@bigpart, $name);
X		if (defined($try = $map{join('/', @bigpart)})) {
X			# Found in map
X			@small = split(/\//, $try);
X			# The next line is needed because of buggy split
X			# split(/x/, '') should give ('') not ()
X			@small = ('') if $#small == -1;
X		} else {
X			# Create new map
X			# Even if the name is short we may have used it up
X			# by shortening up a bigger one, so we may have to
X			# count
X			$name = &$mapfunc($name);
X			$count = '';
X			while ($used{join('/', @s = (@small, $name . $count))}) {
X				if ($count eq '') {
X					$name = &$countprepfunc($name);
X					$count = 'AA';
X				} else {
X					$count++;
X				}
X			}
X			@small = @s;
X			$used{join('/', @small)} = 1;
X			$map{join('/', @bigpart)} = join('/', @small);
X		}
X	}
X	return join('/', @small);
X}
X
X#
X# Convert a single string to something close to it with length up
X# to length given
Xsub shorten {
X	local($str, $len) = @_;
X
X	# Do "fonetic speling" from end to beginning
X	while (
X		length($str) > $len && (
X			$str =~ s/(.*)([fglmnprst])\2(.*)/\1\2\3/i ||
X			$str =~ s/(.*)(ou)(.*)/\1u\3/i ||
X			$str =~ s/(.*)(ck)(.*)/\1k\3/i ||
X			$str =~ s/(.*)(ks)(.*)/\1x\3/i ||
X			$str =~ s/(.*)(sh)(.*)/\1s\3/i ||
X			$str =~ s/(.*)(ph)(.*)/\1f\3/i ||
X			$str =~ s/(.*)(oo)(.*)/\1u\3/i
X		)
X	) { ; }
X	# Remove lowercase vowels from the end to the beginning
X	while (
X		length($str) > $len && 
X		$str =~ s/(.*)[aeiou](.*)/\1\2/
X	) { ; }
X	# Remove uppercase vowels from the end to the beginning
X	while (
X		length($str) > $len && 
X		$str =~ s/(.*)[AEIOU](.*)/\1\2/
X	) { ; }
X	# Finally cut characters from the end
X	$str = substr($str, 0, $len);
X	return $str;
X}
X
X# Create a canonic file name out of one containing .. and . 
X# Employ Unix semantics: empty file means root directory.
Xsub canonic {
X	local(@comp, @can);
X
X	@comp = split(/\//, $_[0]);
X	for ($i = 0; $i <= $#comp; $i++) {
X		if ($comp[$i] eq '.') {
X			;
X		} elsif ($comp[$i] eq '') {
X			@can = ();
X			push(@can, '');
X		} elsif ($comp[$i] eq '..') {
X			pop(@can);
X		} else {
X			push(@can, $comp[$i]);
X		}
X	}
X	return join('/', @can);
X}
X
X# A do nothing procedure
Xsub copy {
X	return $_[0];
X}
X
X# Copy a tape archive from stdin to stdout
Xsub copytar {
X	binmode STDIN;
X	binmode STDOUT;
X
X	forloop: for(;;) {
X		read(STDIN, $header, 512) == 512 || die "$0: Couldn't read header: $!\n";
X		if ($header eq "\0" x 512) {
X			last forloop;
X		}
X		($name, $mode, $uid, $gid, $size, $mtime, $checksum, $linkflag, $linkname) = unpack("a100 A7x A7x A7x A12 A12 a8 a1 a100", $header);
X		#
X		# Header modification code should be put here
X		# HEADERMOD
X		$name =~ s/[\000 ]*//g;
X		$name = &$proc($name);
X		if ($linkflag != "\0") {
X			$linkname =~ s/[\000 ]*//g;
X			$linkname = &$proc($linkname);
X		}
X
X		# Create dummy header for checksum calculation (checksum is blanks)
X		$hnew = pack("a99x A6a1x A6a1x A6a1x A11a1 A11a1 a8 a1 a99x x255", 
X			($name, $mode, ' ', $uid, ' ', $gid, ' ', $size, ' ', $mtime, ' ', ' ' x 8, $linkflag, $linkname));
X		$sz = $size;
X		$sz =~ s/ *//g;
X		$sz = oct($sz);
X		$checksum =~ s/ *//g;
X		$checksum = oct($checksum);
X		$newcheck = &check($hnew);
X		# Create the header with the new checksum
X		$hnew = pack("a99x A6a1x A6a1x A6a1x A11a1 A11a1 a6xa1 a1 a99x x255", 
X			($name, $mode, ' ', $uid, ' ', $gid, ' ', $size, ' ', $mtime, ' ', sprintf('%6o', $newcheck), ' ', $linkflag, $linkname));
X		print STDOUT $hnew;
X		# Copy contents
X		for ($i = 0; $i < $sz; $i += 512) {
X			read(STDIN, $contents, 512) == 512 || die "$0: Couldn't read data: $!\n";
X			print STDOUT $contents;
X		}
X		#seek(STDIN, (int($sz / 512) + 1) * 512, 1) unless $sz == 0;
X	}
X	# Write EOF
X	if (!$opt_e) {
X		print STDOUT pack("x512", ());
X		print STDOUT pack("x512", ());
X	}
X}
X
X# Return checksum for tar header block
Xsub check {
X	$h = $_[0];
X	local($i, $s);
X
X	$s = 0;
X	for($i = 0; $i < 512; $i++) {
X		$s += unpack('C', substr($h, $i, 1));
X	}
X	return $s;
X}
X
X"
X:end ", 0;
SHAR_EOF
if test 10530 -ne "`wc -c < 'tarfix'`"
then
	echo shar: error transmitting "'tarfix'" '(should have been 10530 characters)'
fi
chmod +x 'tarfix'
fi # end of overwriting check
echo shar: extracting "'tarfix.1'" '(6300 characters)'
if test -f 'tarfix.1'
then
	echo shar: will not over-write existing file "'tarfix.1'"
else
sed 's/^X//' << \SHAR_EOF > 'tarfix.1'
X.\" (C) Copyright 1990,91 Diomidis Spinellis.  
X.\" 
X.\" Permission to use, copy, and distribute this software and its
X.\" documentation for any purpose and without fee is hereby granted,
X.\" provided that the above copyright notice appear in all copies and that
X.\" both that copyright notice and this permission notice appear in
X.\" supporting documentation.
X.\" 
X.\" THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
X.\" WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
X.\" MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
X.\"
X.TH TARFIX 1 "9 December 1991"
X.DA 17 May 1990
X.SH NAME
Xtarfix \- fix and convert tape archives
X.SH SYNOPSIS
X\fBtarfix\fP \-[\fBacnme\fP] 
X[\fB\-s\fP \fIN\fP]
X[\fB\-f\fP \fImsdos\fP or \fIvms\fP]
X[\fB\-t\fP \fImsdos\fP or \fIv7\fP]
X.SH DESCRIPTION
X\fITarfix\fP is a filter used to fix and convert \fItar\fP(5) archives.
XWithout any options \fItarfix\fP reads a tape archive from standard input
Xand prints it on standard output fixing any wrong checksums found.  This
Xis useful after editing a tape archive using a binary editor.
X.SH OPTIONS
X.IP "\fB\-a\fP"
XFix an archive containing absolute filenames.  The leading \fB/\fP from
Xevery filename is removed.
X.IP "\fB\-c\fP"
XCreate canonical filenames.  Any relative parts in a file name like \fB/../\fP,
X\fB/./\fP and \fB//\fP are removed.  Note that \fB//\fB is interpreted as the
Xroot directory.
X.IP "\fB\-e\fP"
XRemove end of file header indicators and data.  This can be used to
Xcreate an archive that can have other archives concatenated to it.
X.IP "\fB\-n\fP"
XDo not perform the conversion on a tape archive.  A list of filenames will be
Xread from the standard input, will be converted as specified by other options
Xand will be printed on standard output.  This is useful for creating
Xshell scripts to perform the conversion or for dealing with
Xother archiving programs.
X.IP "\fB\-m\fP"
XPrint a map of all filenames converted on standard error.  Such a map
Xis needed when trying to decipher what each filename used to be called,
Xin order to fix makefiles etc.
X.IP "\fB\-s\fP \fIN\fP"
XShorten all filename components in the archive to length \fIN\fP.  The 
Xshortening algorithm is discussed later.
X.IP "\fB\-f\fP \fImsdos\fP"
XAssume that the archive to be converted comes from an \fIMS-DOS\fP system.
XThe following conversions will be performed:
X.IP
X\(bu All backslashes will be converted to slashes.
X.IP
X\(bu Uppercase characters will be converted to lowercase.
X.IP
X\(bu A leading drive name followed by a colon will be removed.
X.IP "\fB\-f\fP \fIvms\fP"
XAssume that the archive to be converted comes from a \fIVMS\fP system.
XThe following conversions will be performed:
X.IP
X\(bu Uppercase characters will be converted to lowercase.
X.IP
X\(bu A leading drive name followed by a colon or node name followed by
Xa double colon will be removed.
X.IP
X\(bu A directory name of the form \fI[xxxx]\fP will be converted 
Xto \fIxxxx/\fP.
X.IP
X\(bu A trailing semicolon followed by a generation number will be removed.
X.IP
X\(bu Double quotes and \fI^V\fP characters will be removed.
X.IP "\fB\-t\fP \fImsdos\fP"
XConvert the archive to \fIMS-DOS\fP conventions.
XThe following conversions will be performed:
X.IP
X\(bu Each filename component will have all the dots, but one removed to
Xconvert it to a name followed by an optional extension.
X.IP
X\(bu The name will be shortened to 8 characters.
X.IP
X\(bu The extension will be shortened to three characters.
X.IP
X\(bu Any one of the ``\fB,=+<>|; *?:[]\\"\fP'' characters will be converted to
Xa ``\fB^\fP''.
X.IP
X\(bu If a name is any of the device names \fIcon, aux1, aux2, aux3, aux4, 
Xprn, clock$, lpt1, lpt2\fP or \fIlpt3\fP it will have an underline prepended
Xto it.
X.IP "\fB\-t\fP \fIv7\fP"
XConvert the archive to the \fIseventh edition Unix\fP naming conventions.
XAll components of a filename will be shortened to 14 characters.
X.LP
XOnly one of the conversion options should be given.  All conversions
Xcreate canonical file names.
X.LP
X\fITarfix\fP tries quite hard to retain meaningful filenames when it
Xshortens them.  The shortening algorithm is a many pass one, on each
Xpass less meaning from the filename is preserved.  
XAll transformations are done starting from the end of the name and
Xmoving towards the beginning.
XFirst a series of
Xphonetic transformations are prerformed.  Double letters are removed
Xand the pairs \fIou, ck, ks, sh, ph\fP and \fIoo\fP are substituted
Xwith \fIu, k, x, s, f\fP and \fIu\fP respectively.  If this procedure
Xdoes not produce a short enough filename then vowels will start
Xgetting removed, first lowercase, then uppercase.  Finally the string 
Xis trimmed by removing extraneous characters from its end.
X.LP
X\fITarfix\fP remembers all the filenames converted and will not map
Xtwo different filenames to the same one.  The letters \fIAA\fP will
Xbe appended to dublicate filename components created and these will
Xbe inscremented with every name clash.
X.SH EXAMPLES
X``dd if=/dev/rmt8 ibs=20b | tarfix -m -t v7 2> map | tar xf -'' will 
Xread an archive
Xcontaining long filenames on a short filename file system (e.g. Minix).  At
Xthe end, the file \fImap\fP will contain a list of filenames as they were
Xin the archive and the name with which each was extracted.
X.LP
X``find . -print | tarfix -s 12 -n | sed -e 's/^/mv /' | sh'' converts
Xall filenames in a directory hierarchy to use up to 12 characters.
X.LP
X``tar cf uupc | tarfix -t msdos | dd obs=20b of=/dev/rmt8'' will create 
Xa tape archive that can be easily extracted on an \fIMS-DOS\fP system.
X.LP
X``(rsh host1 'tar cf - dir1 | tarfix -e' ; rsh host2 tar cf - dir2) | dd 
Xobs=20b of=/dev/rmt8'' will put onto tape as one archive directories from
Xtwo different hosts.
X.SH AUTHOR
XDiomidis Spinellis <dds@cc.ic.ac.uk>
X.SH "SEE ALSO"
Xtar(1), tar(5), dd(1), find(1), cpio(1)
X.SH DIAGNOSTICS
XWill complain if it can not read the a header or data block of an archive.
X.SH BUGS
X.LP
XRead and write operations are not done in blocksize boundaries; use dd(1)
Xwhen input or output is a magnetic tape.
X.LP
XThis is a beta release.  More extensive testing is needed.
X.LP
XIt is assumed that the filenames are written in English.
X.LP
XThe author has never in his life seen a tape archive created on VMS.
X.LP
XNot the fastest program around, but the price is right.
SHAR_EOF
if test 6300 -ne "`wc -c < 'tarfix.1'`"
then
	echo shar: error transmitting "'tarfix.1'" '(should have been 6300 characters)'
fi
fi # end of overwriting check
#	End of shell archive
exit 0
-- 
Diomidis Spinellis    Internet: <dds@doc.ic.ac.uk>    UUCP: ...!ukc!icdoc!dds
Department of Computing, Imperial College, London SW7     #include "/dev/tty"
Newsgroup comp.lang.perl contents
Newsgroup list
Diomidis Spinellis home page
Unless otherwise expressly stated, all original material on this page created by Diomidis Spinellis is licensed under a Creative Commons Attribution-Share Alike 3.0 Greece License.