Derek Gatherer: Quantifying Contagion: Listing 3

This site has been permanently archived. This is a static copy provided by the University of Southampton.

#!/usr/local/bin/perl

# Listing_3.pl

=head1 NOTES

This script calculates the memetic isolations for each chosen trait in Murdock.

As with all the scripts, Perl must be installed.

The usage is:

Place the raw data file (dg_raw_data.txt) in the same directory as the script, then

perl -w Listing_3..pl

The first time the script is run, you'll need to make the distance matrix.
This will be created as a file anthro_dist_table.txt

After you've run the script once, you can access the distances from the file,
so to save time, then remove the "comment outs" on lines 68 and 141.

As with all Perl scripts, they are offered as is, without any warranties concerning
safety, functionality etc

The author welcomes any comments on any aspects of the software.
to: dgatherer2002@yahoo.co.uk
This is an occasionally on-going project and newer versions of code are frequently
available, so if you want the really rough cuts, please ask.

=cut

#==============PRELIMINARIES===============================#

use strict;
my $pi = atan2(1,1) * 4;	        # constant pi
my $r = 6378;		# constant radius of earth (miles)

print "\nDistance maker: \n\nopening input file.............";

open (INFILE, 'dg_raw_data.txt') or die "can't open infile: $!";

my @input = <INFILE>;
my $size = @input;

print "\n\nThe table has $size lines (should say 864 lines, if not wrong input file)."; # just a check for correct input format

print "\n\nmaking array..................";

my @matrix;
my @line;

for(my $entry=1;$entry<=$#input;$entry++)		# each line of the raw data
{
	@line = (split /\t/, $input[$entry]);
	push @matrix, [ @line ];		# make anonymous array of arrays
}

my $av_dist=0;					# the average distance
my $no_of_dists=0;				# counter
my $max_dist=-1000;				# top and bottom dists
my $min_dist=1000;				# arbitrarily low and high
my %dist_hash;					# holds all the distances
my %attributes;					# holds the attribute being measured
my $dims = $#line-1;

#==============CALCULATE DISTANCES===============================#
#=head1       # now calculate the similarity matrix

print "\nopening output file............";

open(OUTFILE, '>anthro_dist_table.txt') or die "can't find the outfile $!\n";

for(my $z=0;$z<=$#matrix;$z++)	# top row
{
        print OUTFILE "\t$matrix[$z][1]";
}

for(my $x=0;$x<=$#matrix-1;$x++)
{
	print OUTFILE "\n$matrix[$x][1]\t";
                print  "\n$matrix[$x][1]\t";

#	for(my $tabs=0;$tabs<=$x;$tabs++)	# for padding triangiular matrix
#	{
#		print OUTFILE "\t";
#	}

                for(my $y=0;$y<=$#matrix;$y++)		# square matrix, or hash
	{
		my $distance=0;		# reset to zero for each pair
		my $a1; # = lat1  in radians
		my $b1; # = lon1  in radians
		my $a2; # = lat2  in radians
		my $b2; # = lon2  in radians

		$matrix[$x][30] =~ /(\d{1,3})([NS])\s+(\d{1,3})([EW])/; # eg 3N 45W
		$a1 = $1*($pi/180);
		my $NorS = $2;
		$a1 *= -1 if $NorS eq "S";
		$b1 = $3*($pi/180);
		my $EorW = $4;
		$b1 *= -1 if $EorW eq "W";

		$matrix[$y][30] =~ /(\d{1,3})([NS])\s+(\d{1,3})([EW])/; # eg 3N 45W
		$a2 = $1*($pi/180);
		$NorS = $2;
		$a2 *= -1 if $NorS eq "S";
		$b2 = $3*($pi/180);
		$EorW = $4;
		$b2 *= -1 if $EorW eq "W";

		$distance = &acos(cos($a1)*cos($b1)*cos($a2)*cos($b2)
					+cos($a1)*sin($b1)*cos($a2)*sin($b2)
					+sin($a1)*sin($a2))*$r;

		if($distance > $max_dist)
		{
			$max_dist = $distance;
		}
		if($distance < $min_dist)
		{
			$min_dist = $distance;
		}
		$av_dist += $distance;
		$no_of_dists++;
		printf (OUTFILE "%.3f", $distance);	# how to make it statistically analysable
		print OUTFILE "\t";
	}
}

print "\nAverage distance is ";
printf '%.3f', $av_dist/$no_of_dists;
print " out of $no_of_dists counted";
print "\nMax: ";
printf '%.3f', $max_dist;
print "\nMin: ";
printf '%.3f', $min_dist;

close OUTFILE;		# seal distance matrix once created
#=cut

#==============CALCULATE CLUSTERING===============================#

for(my $col=7; $col<=29; $col++)      # all columns
{
#	$attributes{$matrix[0][1]} = $matrix[0][$col];

	my @line = split /\t/, $input[0];
	print "\n$line[$col]";                                            # print column titles
	for(my $z=0;$z<=$#matrix;$z++)	# square
	{
		$attributes{$matrix[$z][1]} = $matrix[$z][$col];
	}

        while((my $soc, my $att) = each %attributes)
        {
#        	print "\n$soc\t$att";
        }

        open(MATFILE, 'anthro_dist_table.txt') or die "can't find the outfile $!\n";

        my @reload = <MATFILE>;
        my @dists;

        for(my $entry=0;$entry<=$#reload;$entry++)		# each line of the distance matrix
        {
	@line = (split /\t/, $reload[$entry]);
	push @dists, [ @line ];			# make anonymous array of arrays
        }

        for(my $x=1;$x<=$#dists-1;$x++)
        {
        	for(my $y=1;$y<=$#dists;$y++)		# square matrix, or hash
	{
		$dist_hash{$dists[$x][0]}{$dists[0][$y]} = $dists[$x][$y];	# HoH
                   #            print "\n$dists[$x][0] to $dists[0][$y] is $dists[$x][$y]";
	}
        }

        my %nn;		# hash of nearest neighbours
        my %nn_dist;
        my %nn_same;	# hash of nearest neighbours with identical attribute
        my %nn_same_dist;

        for(my $x=1;$x<=$#dists-1;$x++)
        {
	my $nn_dist = my $nn_same_dist = 2000000;	# arbitrarily high
	for(my $y=1;$y<=$#dists;$y++)		# square matrix, or hash
	{
		if($dist_hash{$dists[$x][0]}{$dists[0][$y]} < $nn_dist
			&& $dists[$x][0] ne $dists[0][$y])
		{
			$nn_dist = $dist_hash{$dists[$x][0]}{$dists[0][$y]};
			$nn{$dists[$x][0]} = $dists[0][$y];
			$nn_dist{$dists[$x][0]} = $nn_dist;
		}
		if($dist_hash{$dists[$x][0]}{$dists[0][$y]} < $nn_same_dist
			&& $attributes{$dists[$x][0]} eq $attributes{$dists[0][$y]}
				&& $dists[$x][0] ne $dists[0][$y]
			     		&& $attributes{$dists[$x][0]} !~ /[\.X]/)
		{
			$nn_same_dist = $dist_hash{$dists[$x][0]}{$dists[0][$y]};
			$nn_same{$dists[$x][0]} = $dists[0][$y];
			$nn_same_dist{$dists[$x][0]} = $nn_same_dist;
		}
	}
        }

        my $valids = 0;			# number of informative comparisons
        my $diff = 0;			# total extra distance needed
        my $yes =0;			# numberof nns that are also same
        for(my $x=1;$x<=$#dists-1;$x++)
        {
	if(defined $nn_same{$dists[$x][0]})
	{
		$diff += $nn_same_dist{$dists[$x][0]} - $nn_dist{$dists[$x][0]};
		$valids++;
		$yes++ if($nn_same_dist{$dists[$x][0]} == $nn_dist{$dists[$x][0]})
	}
        }
        $diff /= $valids;
        print "\nAverage distance to nearest neighbour of same trait type = ";
        printf("%.2f",  $diff);
        print "\nover $valids comparisons";
        print "\nPercentage same nns = ";
        printf("%.2f", $yes/$valids);
        print "\nPercentage isolates = ";
        printf("\t%.2f", (1-($yes/$valids)));
}

# subroutine acos input: an angle in radians
# output: returns the arc cosine of the angle
# description: this is needed because perl does not provide an arc cosine function

sub acos
{
        my($x) = @_;
        $x = sprintf("%.6f", $x);
        my $ret = atan2(sqrt(1 - $x**2), $x);
        return $ret;
}
Button Return to the article