The Original Perl Video Cleaner

The original Perl video cleaner script provided here unedited.

#!/usr/bin/perl -w
#
# This parses mp4, wmv, mkv top level chunks, displays offset and length,
# and if invalid data is added at the end, it can be removed with the -f
# flag.
#
# To use this perl script on windows, strawberryperl from
# strawberryperl.com
#
# perl D:\torrents\videocleaner.pl D:\torrents\clip.mp4
#
# If it shows anything invalid at the end of the file, running it again with
# -f will remove those extra bytes.
#
# perl D:\torrents\videocleaner.pl -f D:\torrents\clip.mp4
#

use File::Copy;
use Fcntl qw(:flock SEEK_END);

my $force = 0;
my $bytescutoff = 1000;


# Help
if ($ARGV[0] eq "-h") {
    print "Usage: $0 [-f] <media filename>\n";
    print "  -f splits invalid trailing bytes out into a separate file.\n";
    exit;
} elsif ($ARGV[0] eq '-f') {
    $force = 1;
    shift @ARGV;
}
$cfn = $ARGV[0];


open(my $fh, "<", $cfn) or die $!;
binmode($fh);
$cfs = -s $fh;

# truncate filename (arg1) of a given size (arg2) by (arg3) bytes.
# returns nothing
sub binTrunc {
    my ($cfn, $cfs, $truncbytes) = @_;

    my $newbytes = $cfs - $truncbytes;
    print "truncating $cfn to $newbytes\n";

    # make sure file is readable
    my $perm = (stat $cfn)[2] & 07777;
    chmod($perm | 0600, $cfn) or die "Unable to make file writable, $!\n";
    # truncate
    truncate($cfn, $cfs-$truncbytes) or die "Unable to truncate file, $!\n";
}


sub backupTag {
    my ($tagfile, $c4stag) = @_;

    my $tagfn = $tagfile;
    my $eidx = 1;

    # find a name that doesn't exist
    while ( -e $tagfn ) {
        $tagfn = "$tagfile.$eidx";
        $eidx++;
    }

    open(my $fho, ">", $tagfn) or die $!;
    binmode($fho);
    print $fho $c4stag;
    close($fho);

    print "Backed up trailing data to ($tagfn)\n";
}

# identify the file type given the first few bytes from the file
# (needs to be passed at least 8 bytes)
# returns "mp4" or "wmv" or "mkv"
sub filetype {
    my ($header) = @_;
    my $ftype;

    # examples
    # ASF "\x30\x26\xb2\x75\x8e\x66\xcf\x11\xa6\xd9\x00\xaa\x00\x62\xce\x6c"
    # MP4 "\x00\x00\x00.ftypisom"
    # Matroska  1a 45 df a3 93 42 82 88 6d 61 74 72 6f 73 6b 61
    #   = 00011010b 45 df a3 10010011b (19)...
    #
    #   00000001b 42 85 81 01 18 53 80 (id)
    #   = 67 01 00 = 01100111b 00000001b = 9985

    if (($header =~ /\A\x00\x00\x00.ftyp(isom|iso2|qt  |mp4[12]|M4V )/) ||
        ($header =~ /\A\x00...moov/)) {
        $ftype = "mp4";
    } elsif (substr($header, 0, 4) eq "\x30\x26\xb2\x75") {
        $ftype = "wmv";
    } elsif (substr($header, 0, 4) eq "\x1a\x45\xdf\xa3") {
        $ftype = "mkv";
    } else {
        $ftype = "unknown";
    }
    return $ftype;
}


# takes file type as argument
sub chunk_min {
    my ($ftype) = @_;
    my $chunkmin;

    if ($ftype eq "mp4") {
        $chunkmin = 8;
    } elsif ($ftype eq "wmv") {
        $chunkmin = 24;
    } elsif ($ftype eq "mkv") {
        $chunkmin = 16;
    } else {
        die "Unknown file header type\n";
    }
}

# takes file type and chunkdata as arguments
sub decode_chunk_type_length {
    my ($ftype, $contfh, $pos, $chunkdata) = @_;

    my $prettypos = sprintf("%10u", $pos);
    my ($chunklength, $chunktype);
    my $chunkextra = 0;

    if ($ftype eq "wmv") {
        # ASF uses little-endian for number storage
        ($chunktype, $chunklength) = unpack('(H32Q)<', $chunkdata);
    } elsif ($ftype eq "mp4") {
        # ISO mpeg uses big-endian for number storage
        ($chunklength, $chunktype) = unpack('(LA4)>', $chunkdata);
        if ($chunklength == 1) {
            # 32 bit size 1 in mp4 means 64 bit size after the chunk type
            my $chunk64size;
            read($contfh, $chunk64size, 8) or die $!;
            ($chunklength) = unpack('(Q)>', $chunk64size);
            $chunkextra += 8;
        }
    } elsif ($ftype eq "mkv") {
        my ($vint1len, $vint2len);
        # unpack mkv chunk id
        ($vint1len, $chunktype) = &ebml_size_unpack($chunkdata);
        if ($vint1len > 0) {
            # unpack mkv chunk size
            ($vint2len, $chunklength) = &ebml_size_unpack(substr($chunkdata, $vint1len, -1));
        }
        if ($vint1len < 1 || $vint2len < 1) {
            print "corrupt mkv block (invalid block size)\n";
            $chunklength = 0;
            $chunktype = "----";
        } else {
            $chunklength += $vint2len + $vint1len;
        }
        $chunktype = sprintf("%x", $chunktype);
    } else {
        die "Unknown file type\n";
    }

    printf("offset %11i length %11i  (type %s)\n", $prettypos, $chunklength, $chunktype);
    return ($chunktype, $chunklength, $chunkextra);
}


sub is_chunk_size_ok {
    my ($ftype, $chunkmin, $chunklength, $remainingbytes) = @_;

    if ($chunklength > $remainingbytes) {
        return 0;
    }

    if ($ftype eq 'mkv') {
        # is 0 if there's a problem with the chunk
        return $chunklength >= 2;
    } else {
        return $chunklength >= $chunkmin;
    }
}

# variable length integer unpacker for matroska (mkv) files only.
# takes binary data/string with a mkv chunk type and variable length integer
# returns a list (chunk type, chunk size)
sub ebml_size_unpack {
    my ($chunkdata) = @_;

    # encoded chunk size does not include elementid and chunk size
    # specs say big endian

    my $rawbytei = ord(substr($chunkdata, 0, 1));
    my $vibytes;

    if ($rawbytei == 0) {
        $vibytes = -1;
    } else {
        $vibytes = 8 - int(log($rawbytei)/log(2));
    }

    # check for missing set bit, or not enough header bytes
    return (-1, -1) if ( ($vibytes == -1) || ($vibytes > length($chunkdata)) );

    my $accu = (0xff >> $vibytes) & $rawbytei;

    my $i = 1;
    while ($i < $vibytes) {
        $accu = $accu * 256 + ord(substr($chunkdata, $i, 1));
        $i++;
    }
    #print "returning $vibytes, $accu\n";
    return ($vibytes, $accu);
}


# Parses the file according to its
# file type, looking for small broken chunks near the end.
#
# Returns the number of tag/junk bytes found at the end of the file, up to the number
# set in the second parameter for safety, so it doesn't truncate half of a file
sub findtagbytes {
    # ISO media file top level structure
    # Series of chunks, 4 byte size (BE), 4 byte chunk name, and data

    # ASF v1 media file top level structure
    # Series of chunks, 16 byte guid, 8 byte chunk length (LE), and data

    my ($contname, $bytescutoff) = @_;

    open(my $contfh, "<", $contname) or die $!;
    binmode($contfh);

    # detect file type
    read($contfh, my $header, 16);
    my $ftype = &filetype($header);
    my $chunkmin = &chunk_min($ftype);

    my $contsize = -s $contfh;

    printf("File type %s, size     (%12i)\n", $ftype, $contsize);

    my $pos = 0;
    # reset file position
    seek($contfh, $pos, 0);  # 0 is SEEK_SET in fcntl module
    while ($pos < $contsize) {
        my $remainingbytes = $contsize - $pos;

        # failsafe position check, in case code is modified and file seeking doesn't
        # work correctly, avoid corrupting files
        my $testpos = tell($contfh);
        if ($testpos != $pos) {
            die "Mismatch between real pos $testpos and assumed pos $pos\n";
        }

        # If not enough room for a valid chunk, print warning and abort
        if ($remainingbytes < $chunkmin) {
            print "small junk section at end ($remainingbytes bytes)\n";
            close($contfh);
            return $remainingbytes;
        }

        # read the chunk header
        my $chunkdata;
        read($contfh, $chunkdata, $chunkmin) or die $!;

        # decode and print chunk header info
        my ($chunktype, $chunklength, $chunkextra) = &decode_chunk_type_length($ftype, $contfh, $pos, $chunkdata);

        $bytes_after = $remainingbytes - $chunklength;
        printf("                        (%12i) bytes remaining", $bytes_after);
        print "\n";

        # check for problems
        if (! &is_chunk_size_ok($ftype, $chunkmin + $chunkextra, $chunklength, $remainingbytes)) {
            if ($remainingbytes > $bytescutoff) {
                # if there are too many remaining bytes, do nothing, to avoid corruption
                print "CAUTION: invalid chunk with >$bytescutoff ($remainingbytes) bytes left in file\n" .
                    "corruption probably unrelated to trailing junk bytes... not modifying\n";
                # set to zero to avoid truncate
                $remainingbytes = 0;
            } else {
                print "  Junk detected at end of file\n";
            }

            close($contfh);
            return $remainingbytes;
        }

        last if ($chunklength == 0);

        $pos += $chunklength;
        seek($contfh, $pos, 0); # 0 is SEEK_SET in module fcntl
    }
    close($contfh);
    return 0;
}


# This section looks for junk or broken top layer chunks
# In addition to what some clip sites add
# Some files uploaded to file locker sites have a few added null characters

my $tagbytes = findtagbytes($cfn, $bytescutoff);

print "\n";

if ($tagbytes > 0) {
    # findtagbytes() found between 1 and $bytescutoff junk bytes at the end of the file
    # not belonging to a valid media file chunk, so remove it
    #
    # Read the trailing bytes into a variable for backup purposes.
    seek($fh, -$tagbytes, SEEK_END);
    read($fh, $c4stag, $tagbytes);

    die "Run with '-f' flag to actually truncate the file." unless $force;

    # Back up the trailing data and remove it by truncating the media file
    &backupTag($cfn . ".tag", $c4stag);
    &binTrunc($cfn, $cfs, $tagbytes);
    print "cleaned $cfn\n";
} else {
    print "No trailing junk data detected (limited to 1-$bytescutoff bytes)\n";
}

close($fh);
Edit
Pub: 30 May 2024 18:03 UTC
Views: 34