small uniq trick

Mon Jan 28 05:06:01 EET 2008

On 2008-01-28 02:25, Giorgos Keramidas <keramida at ceid.upatras.gr> wrote:
> > $ time cat worstcase | cut -f2- -d: |  sort | uniq > /dev/null
> >
> > real    0m2.522s
> > user    0m1.672s
> > sys     0m0.782s

Η πιο «κουρασμένη» απάντηση, επειδή δεν έχω ύπνο απόψε, είναι ότι έχεις
100% δίκιο, αφού έκατσα και το μέτρησα κι εγώ...

Χρήσιμο εργαλείο είναι και το ministat(1) από το BSD, που μπορεί (σε
συνδυασμό με το timeit.pl που είχα γράψει παλιότερα) να τυπώσει πιο
ωραία διαγράμματα, όπως αυτά στο τέλος αυτού του email:

$ cat a.sh
#!/bin/sh

# Baseline pipeline: sort stdin (sort's temporary files go under /var/tmp),
# then let uniq collapse the repeated lines that sorting made adjacent.
sort -T /var/tmp | uniq

$ cat b.sh
#!/bin/sh

# Candidate pipeline: run uniq over the raw stdin first, so repeated lines
# that are already adjacent are dropped before sort has to handle them; then
# sort (temporary files under /var/tmp) and run uniq again on the result.
uniq | sort -T /var/tmp | uniq

$ cat dataset.pl
#!/usr/bin/perl -Tw

use strict;

# Write $line_count lines to stdout, each one a randomly chosen element of
# @alphabet.  With only three distinct values, the output consists almost
# entirely of duplicate lines.
my @alphabet   = qw(a b c);
my $choices    = scalar @alphabet;
my $line_count = 10_000_000;

foreach (1 .. $line_count) {
    my $pick = int(rand() * $choices);
    print $alphabet[$pick], "\n";
}

$ cat timeit.pl
#!/usr/bin/perl -Tw
#-
# Copyright (c) 2006, 2007, 2008 Giorgos Keramidas <keramida at FreeBSD.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer
#    in this position and unchanged.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

use strict;
use POSIX;

my %children;

#
# Print a "WARNING:"-prefixed diagnostic on stderr.
#
# The arguments are joined with single spaces; a trailing newline, if
# present, is removed so that the message is always printed with exactly
# one.
#
# Returns nothing: undef in scalar context and the empty list in list
# context.  (A `return undef' would give list-context callers a one-element
# list, which counts as true.)
#
sub warning(@) {
    my $msg = join(' ', "WARNING:", @_);
    chomp($msg);
    warn("$msg\n");
    return;
}

#
# Print an informational message on stderr: the arguments separated by
# single spaces and terminated by exactly one newline.
#
sub message(@) {
    my $text = join ' ', @_;
    chomp $text;            # Drop a caller-supplied trailing newline ...
    warn $text . "\n";      # ... and always emit exactly one.
}

#
# Spawn a child process, wait for it, and return a 3-tuple of times
# (user, sys, real), in seconds, with the time it took to complete.
#
# Arguments:
#   $cmd   - command to run
#   @args  - arguments passed to $cmd
#
# Returns the list (user, sys, real) when the child exits with status 0.
# If fork() or waitpid() fails, or the child is killed by a signal or exits
# with a non-zero status, a warning is printed and the (false) return value
# of warning() is passed back instead.
#
# The user and sys figures cover the CPU time consumed between the two
# POSIX::times() snapshots by this process and by the children it has
# waited for.
#
sub spawn($@) {
    my $cmd = shift;        # Command to run
    my @args = @_;          # Arguments

    # Snapshot *all* counters before forking, not just the wall clock.
    # POSIX::times() reports CPU time accumulated over the whole life of
    # this process (and of every child reaped so far), so without a
    # baseline each call would also include the cost of all earlier runs.
    my ($starttime, $user0, $system0, $cuser0, $csystem0) = POSIX::times();

    my $pid = fork();
    if (!defined($pid)) {
        return warning("fork():$!");
    } elsif ($pid == 0) {
        # Child: only reaches die() when exec() itself failed.
        exec($cmd, @args);
        die("child: exec(): $!\n");
    }

    # Parent: record the child while it is running, then wait for it.
    $children{$pid} = $pid;
    my $ret = waitpid($pid, 0);
    delete $children{$pid};
    if ($ret == -1) {
        return warning("waitpid(): $!\n");
    } elsif ($? & 0xff) {
        # Low byte of $? is non-zero: signal number and/or core-dump flag.
        return warning("$cmd caught signal ", $? & 0x7f, "\n");
    } elsif ($? >> 8) {
        return warning("$cmd returned exit code ", $? >> 8, "\n");
    }

    my ($endtime, $user1, $system1, $cuser1, $csystem1) = POSIX::times();

    # All values are in clock ticks; convert the deltas to seconds.
    my $ticks = POSIX::sysconf(POSIX::_SC_CLK_TCK());
    my $user = ($user1 + $cuser1) - ($user0 + $cuser0);
    my $system = ($system1 + $csystem1) - ($system0 + $csystem0);

    my @times = ($user / $ticks, $system / $ticks,
        ($endtime - $starttime) / $ticks);
    return @times;
}

#
# Run a command $iterations times, and print a (user, system, real) 3-tuple
# of times on stderr for each invocation that completed successfully.
#
# Arguments:
#   $iterations - number of times to run the command
#   $cmd        - command to spawn
#   @args       - arguments to $cmd
#
# Invocations for which spawn() does not return three values are skipped;
# spawn() has already printed a warning for them, and a made-up
# "0.00 user 0.00 system 0.00 real" line would only distort the samples.
#
# Returns nothing.
#
sub timeit($@) {
    my $iterations = shift; # Number of command iterations.
    my $cmd = shift;        # Command to spawn.
    my @args = @_;          # Arguments to $cmd

    # 1..$iterations, not 0..$iterations: the latter runs one time too many.
    foreach my $run (1 .. $iterations) {
        my @times = spawn($cmd, @args);
        next unless @times == 3;
        printf(STDERR "%.2f user %.2f system %.2f real\n", @times);
    }
    return;
}

#
# Entry point.  Usage: timeit.pl [samples] command [args ...]
#
# The optional leading all-digits argument is the number of samples to
# take (default 30); everything after it is the command to time and its
# arguments.
#
MAIN: {
    # Clear environment and set timezone
    %ENV = (
        'TZ'            => "UTC",
        'PATH'          => "/usr/bin:/usr/sbin:/bin:/sbin",
    );
    tzset();

    if (!@ARGV) {
        exit(0);
    }

    # Untaint @ARGV items.  This should probably enforce more strict checks to
    # the contents of @ARGV, but it's ok for now.

    # Consume the first argument only when it really is a sample count;
    # otherwise it is the command itself and must not be thrown away.
    my $nsamples = 30;
    if ($ARGV[0] =~ m/\A(\d+)\z/) {
        $nsamples = $1;
        shift @ARGV;
    }

    if (!@ARGV) {
        message("usage: timeit.pl [samples] command [args ...]");
        exit(1);
    }

    # \A...\z with /s matches any string, embedded newlines included, so
    # every argument is passed through unchanged and none is left undefined.
    my @args = ();
    foreach my $k (0..$#ARGV) {
        if ($ARGV[$k] =~ m/\A(.*)\z/s) {
            $args[$k] = $1;
        }
    }

    timeit($nsamples, @args);
}

$ perl -Tw timeit.pl 30 ./a.sh < dataset 2>a.dat > /dev/null
$ perl -Tw timeit.pl 30 ./b.sh < dataset 2>b.dat > /dev/null

    # το sed -e 1d παρακάτω είναι `after-the-fact' αλλαγή, επειδή ο
    # πρώτος χρόνος είναι συνήθως τεράστιος σε σχέση με τους άλλους,
    # μέχρι να γεμίσει το buffer cache με τα περιεχόμενα από το input
    # file :(

$ awk '{print $1}' a.dat | sed -e 1d > aa-user
$ awk '{print $3}' a.dat | sed -e 1d > aa-sys
$ awk '{print $5}' a.dat | sed -e 1d > aa-real

$ awk '{print $1}' b.dat > bb-user
$ awk '{print $3}' b.dat > bb-sys
$ awk '{print $5}' b.dat > bb-real

$ ministat -w80 aa-user bb-user
x aa-user
+ bb-user
+--------------------------------------------------------------------------------+
|                                           +                                    |
|                                           +  +                                 |
|                                           +  +  + +                            |
|                                         +++++++++++                            |
|x  x x  x  x x  x x   x  x x   x  x x   *+*++*++*++*x  x  x x  x  x  x x  x x  x|
|                |_______________________AM_|__A__|______________|               |
+--------------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  30         13.34          19.2        16.375     16.295667     1.7971466
+  30         16.33         17.14        16.735     16.735667    0.23698804
No difference proven at 95.0% confidence

$ ministat -w80 aa-sys bb-sys
x aa-sys
+ bb-sys
+--------------------------------------------------------------------------------+
|      +     +                                                                   |
|++++++++++++++                                                                  |
|+++++*++*+*++*  x x  x x  x x  x x x  x x  x x  x x  x  x x  x  x x  x x  x x  x|
|   |___A___|        |_____________________A_____________________|               |
+--------------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  30          2.38         16.87          9.51     9.5863333     4.3801429
+  30          1.34          3.97         2.655          2.65     0.8106447
Difference at 95.0% confidence
        -6.93633 +/- 1.62819
        -72.3565% +/- 16.9845%
        (Student's t, pooled s = 3.14983)

$ ministat -w80 aa-real bb-real
x aa-real
+ bb-real
+--------------------------------------------------------------------------------+
|+                                                                           x   |
|+                                                                           x   |
|+                                                                           x   |
|+                                                                           x   |
|+                                                                           x   |
|+                                                                           x   |
|++                                                                          x   |
|++                                                                          x   |
|++                                                                          x   |
|++                                                                          x   |
|++                                                                          x   |
|++                                                                          x   |
|++                                                                          x x |
|++                                                                          x x |
|++                                                                          x x |
|++                                                                          x x |
|++                                                                          x x |
|++                                                                          x xx|
|MA                                                                          MA  |
+--------------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  30           0.7          0.72           0.7    0.70266667  0.0052083046
+  30          0.11          0.12          0.11         0.114  0.0049827288
Difference at 95.0% confidence
        -0.588667 +/- 0.00263459
        -83.7761% +/- 0.374942%
        (Student's t, pooled s = 0.00509676)

Ειδικά το τελευταίο είναι killer argument για το πόσο πιο efficient
είναι να κάνεις τουλάχιστον ένα πέρασμα με uniq πριν κάνεις sort
σε τεράστια datasets :)