BioPHP - Miscellaneous Class (original)
Original code submitted by joseba. Code below is covered by the GNU GPL v2 license.
Description
Last change: 2005/12/29 14:33 | Recent Changes | ETC (short for "et cetera"): contains miscellaneous code.
Code
Last change: 2005/12/29 14:33 | Download original | Recent Changes | Original code<?php
/* ETC.INC contains definitions for the SubMatrix and SeqMatch classes.
It also contains helper functions such as is_blank(), isa_qualifier(),
firstchar(), left(), etc. */
// Pattern database: maps a pattern name to its pattern string. getpattern()
// looks entries up by name.
// NOTE(review): "_EndCodon" is written as a bracket expression; if it is ever
// used as a regular expression it is a character class, not an alternation of
// the three stop codons -- confirm the intended use before relying on it.
$patterndb = array(
    "_StartCodon" => "AUG",
    "_EndCodon"   => "[UAA,UAG,UGA]",
);
// trim_element() strips leading and trailing whitespace from a single value, in place.
// It has the ($value, $key) signature expected of an array_walk() callback, so
// array_walk($list, 'trim_element') trims every element of $list.
//   $value - the element, received by reference and overwritten with its trimmed form.
//   $key   - the element's key as supplied by array_walk(); not used.
function trim_element(&$value, $key)
{
    $trimmed = trim($value);
    $value = $trimmed;
}
/*
is_false() tests whether a value is the boolean FALSE itself, as opposed to a
value that merely compares equal to it (0, "", NULL, ...). This is needed to
interpret the return value of functions such as strpos(), which returns 0 when
the needle is found at the very start of the haystack and FALSE when it is not
found at all.
Returns TRUE only for the boolean FALSE; returns FALSE for everything else.
*/
function is_false($value)
{
    // Identity comparison checks type and value in one step.
    return $value === false;
}
// rem_right() removes $charcount characters from the right (end) of a string.
//   $str       - the input string.
//   $charcount - number of trailing characters to drop (default 1).
// Returns the shortened string. Removing as many or more characters than the
// string holds yields "" (a negative substr() length would otherwise keep a
// prefix, e.g. 'abc' minus 5 characters used to give 'a'). A zero or negative
// $charcount removes nothing.
function rem_right($str, $charcount = 1)
{
    $str = (string) $str;
    $length = strlen($str);
    if ($charcount <= 0) return $str;
    if ($charcount >= $length) return "";
    return substr($str, 0, $length - $charcount);
}
// intrim() removes every space character (" ") from a string: internal spaces
// as well as any leading or trailing ones. Other whitespace (tabs, newlines)
// is left untouched.
// Uses str_replace() because the pattern is a literal space; the POSIX regex
// function eregi_replace() used previously no longer exists as of PHP 7.
function intrim($string)
{
    return str_replace(' ', '', $string);
}
// getmin() returns the smallest of three (usually numeric) values $x, $y and $z.
// Values are compared with PHP's "<" operator; arguments that are FALSE are not
// given any special treatment.
function getmin($x, $y, $z)
{
    // First settle the smaller of $x and $y ($y wins ties), then compare it with $z.
    $candidate = ($x < $y) ? $x : $y;
    return ($candidate < $z) ? $candidate : $z;
}
// is_even() reports whether an integer is even: TRUE when halving the value
// leaves no fractional part, FALSE otherwise.
function is_even($integer)
{
    $half = $integer / 2;
    return $half == (int) $half;
}
// is_odd() reports whether an integer is odd: TRUE when halving the value
// leaves a fractional part, FALSE otherwise. It is the logical opposite of is_even().
function is_odd($integer)
{
    $half = $integer / 2;
    return $half != (int) $half;
}
// is_blankstr() tests whether a value is exactly the empty string (""). Like
// is_false(), it helps interpret the result of PHP functions or expressions
// where "", 0, NULL and FALSE must be told apart.
// Returns TRUE only for a value of type string with zero length.
function is_blankstr($var)
{
    // Identity comparison requires both the string type and the empty value.
    return $var === "";
}
// is_blank() tests whether a value compares loosely equal (==) to the empty
// string. Unlike is_blankstr() it does not check the type, so NULL and FALSE
// also count as blank.
// The original author intended this function to give way to is_blankstr(); it
// is kept because other code may still call it. Prefer is_blankstr() in new code.
function is_blank($str)
{
    return $str == "";
}
// firstchar() returns the first (leftmost) character of a string, i.e. the
// one-character prefix that left($str, 1) would produce.
function firstchar($str)
{
    $first = substr($str, 0, 1);
    return $first;
}
// left() returns the leading portion of a string.
//   $str      - the input string.
//   $numchars - the length argument handed to substr(), i.e. how many characters
//               to keep counting from the start of $str.
function left($str, $numchars)
{
    $start = 0;
    $prefix = substr($str, $start, $numchars);
    return $prefix;
}
// right() returns the last $numchars characters of a string.
//   $str      - the input string.
//   $numchars - how many characters to keep, counted from the right end.
// Asking for as many or more characters than the string holds returns the whole
// string (a negative substr() offset would otherwise return only a suffix,
// e.g. the last 5 characters of 'abc' used to give 'bc'). A zero or negative
// $numchars returns "".
function right($str, $numchars)
{
    $str = (string) $str;
    if ($numchars <= 0) return "";
    if ($numchars >= strlen($str)) return $str;
    return substr($str, -$numchars);
}
// compare_letter() compares two letters $let1 and $let2 and returns a symbol
// telling whether they are an exact match, a partial match, or a non-match.
//   $let1, $let2 - the two symbols to compare.
//   $matrix      - substitution matrix: an array of rules, each rule an array of
//                  symbols considered partial matches of each other. When NULL or
//                  omitted, the rules of the global $chemgrp_matrix are used; if
//                  that global is not set either, no partial match is possible.
//   $equal       - symbol returned for an exact match. When NULL or omitted,
//                  the letter itself ($let1) is returned.
//   $partial     - symbol returned for a partial match (default "+").
//   $nomatch     - symbol returned when there is no match (default ".").
function compare_letter($let1, $let2, $matrix = null, $equal = null, $partial = "+", $nomatch = ".")
{
    global $chemgrp_matrix;

    // Exact match: no substitution matrix is needed for this case.
    if ($let1 == $let2) {
        return isset($equal) ? $equal : $let1;
    }

    // If no custom substitution matrix was provided, fall back to the default.
    if (!isset($matrix)) {
        $matrix = isset($chemgrp_matrix) ? $chemgrp_matrix->rules : array();
    }

    if (partial_match($let1, $let2, $matrix)) return $partial;
    return $nomatch;
}
/*
partial_match() tells whether two symbols are "partial matches" of each other
according to a substitution matrix ("submatrix" for short).

Each element (rule) of a submatrix is an array of symbols that are considered
partial matches of each other. The default submatrix described by the original
author is:
( ('G','A','V','L','I'), ('S','T'), ('N','Q'), ('F','Y','W'), ('C','M'), ('P'),
  ('D','E'), ('K','R','H'), ('*'), ('X') )

Algorithm:
1) Check whether both $let1 and $let2 appear in the current rule.
2) If they do, it is a "hit": the two are partial matches, return TRUE.
   If they do not, move on to the next rule.
3) If the last rule is reached without a hit, return FALSE.

  $let1, $let2 - the two symbols to test.
  $matrix      - the submatrix (array of rules). When NULL or omitted, the rules
                 of the global $chemgrp_matrix are used; if that global is not
                 set, there are no rules and the result is FALSE.

NOTE: No warning is given when $let1 and/or $let2 appear nowhere in the submatrix.
*/
function partial_match($let1, $let2, $matrix = null)
{
    global $chemgrp_matrix;

    if (!isset($matrix)) {
        $matrix = isset($chemgrp_matrix) ? $chemgrp_matrix->rules : array();
    }

    foreach ($matrix as $rule) {
        // Strict membership test: symbols must match in type as well as value,
        // so loose-comparison quirks cannot produce a false hit.
        if (in_array($let1, $rule, true) and in_array($let2, $rule, true)) return TRUE;
    }
    return FALSE;
}
// getpattern() retrieves a pattern string from the pattern database (the global
// $patterndb array).
//   $pattern - name (array key) of the pattern to look up.
// Returns the stored pattern string, or NULL when no pattern of that name exists.
function getpattern($pattern)
{
    global $patterndb;
    // Guard the lookup so an unknown name yields NULL without an undefined-index warning.
    return isset($patterndb[$pattern]) ? $patterndb[$pattern] : null;
}
// This class allows the use of customized substitution matrices. A matrix is a
// list of rules; each rule is an array of symbols that are considered partial
// matches of each other. See tech doc for details.
class submatrix
{
    // List of rules; each rule is an array of symbols.
    var $rules = array();

    // The constructor initializes the rules property to the empty array.
    // __construct() is used because a method named after its class is no longer
    // treated as a constructor as of PHP 8, which left $rules uninitialized.
    function __construct()
    {
        $this->rules = array();
    }

    // submatrix() is kept so existing explicit calls keep working; it resets
    // the rules property to the empty array.
    function submatrix()
    {
        $this->rules = array();
    }

    // addrule() adds one rule to the substitution matrix. Every argument passed
    // is one symbol of the rule, e.g. addrule('S', 'T').
    function addrule($x)
    {
        // Collect all arguments, not only the first, as the symbols of the rule.
        $symbols = func_get_args();
        array_push($this->rules, $symbols);
    }
}
// SeqMatch compares two sequences: Hamming distance, Levenshtein distance, and
// a symbol-by-symbol match string.
class SeqMatch
{
    // Result string produced by the most recent call to match().
    var $result;
    // Declared by the original class; no method of this class assigns them.
    var $hamdist;
    var $levdist;

    // _seqstring() (internal helper) turns a sequence argument into a plain string:
    // an object contributes its sequence property, a string is used as is, any
    // other scalar is cast to string, and anything else becomes "".
    static function _seqstring($seq)
    {
        if (is_object($seq)) return $seq->sequence;
        if (is_string($seq)) return $seq;
        if (is_scalar($seq)) return (string) $seq;
        return "";
    }

    // hamdist() computes the Hamming Distance between two strings or Seq objects
    // of equal length, i.e. the number of positions at which they differ.
    // Terminates the script if the two sequences differ in length.
    // For more information, consult the technical reference.
    function hamdist($seq1, $seq2)
    {
        $string1 = self::_seqstring($seq1);
        $string2 = self::_seqstring($seq2);

        // We terminate code execution if the two strings differ in length.
        $len = strlen($string1);
        if ($len != strlen($string2))
            die("Both sequence must be of the same length!");

        // Start at 0 (no difference) and add 1 for every mismatching position.
        $distance = 0;
        for ($i = 0; $i < $len; $i++) {
            if ($string1[$i] !== $string2[$i]) $distance++;
        }
        return $distance;
    }

    // levdist() computes the Levenshtein Distance between two strings or Seq objects
    // with equal/unequal lengths. You can pass custom values for cost of insertion,
    // replacement, and deletion. If you don't pass any, they are assumed to be 1.
    // Terminates the script if either string is longer than 255 characters.
    // For more information, see technical reference.
    function levdist($seq1, $seq2, $cost_ins = 1, $cost_rep = 1, $cost_del = 1)
    {
        $string1 = self::_seqstring($seq1);
        $string2 = self::_seqstring($seq2);

        // Check the lengths of the two strings. If they exceed 255 characters, terminate code.
        if (strlen($string1) > 255) die("String length must not exceed 255 characters!");
        if (strlen($string2) > 255) die("String length must not exceed 255 characters!");

        // Compute and return the Levenshtein Distance using PHP's built-in levenshtein() function.
        return levenshtein($string1, $string2, $cost_ins, $cost_rep, $cost_del);
    }

    // xlevdist() is an extended version of levdist() which accepts strings longer
    // than 255 characters, up to 1024. The cost of insertion, deletion, and
    // replacement is fixed to 1. Terminates the script if either string is longer
    // than 1024 characters.
    function xlevdist($s, $t)
    {
        $s = (string) $s;
        $t = (string) $t;
        $n = strlen($s);
        $m = strlen($t);
        if (($n > 1024) or ($m > 1024)) die("String length must not exceed 1024 characters");

        // $previous[$j] holds the distance between the first $i-1 characters of $s
        // and the first $j characters of $t. For the empty prefix of $s that is
        // simply $j insertions, which also gives the right answer when $s is "".
        $previous = range(0, $m);

        for ($i = 1; $i <= $n; $i++) {
            // Turning $i characters of $s into the empty prefix of $t takes $i deletions.
            $current = array($i);
            $lets = $s[$i - 1];
            for ($j = 1; $j <= $m; $j++) {
                $cost = ($lets === $t[$j - 1]) ? 0 : 1;
                $current[$j] = min(
                    $previous[$j] + 1,        // deletion
                    $current[$j - 1] + 1,     // insertion
                    $previous[$j - 1] + $cost // replacement (free when the letters match)
                );
            }
            $previous = $current;
        }
        return $previous[$m];
    }

    /*
    The match() method accepts two sequence strings (not objects) of equal length,
    and returns a sequence match result string, built position by position:
    If there is an exact match, the $equal symbol (by default the symbol itself).
    If there is a partial match, the $partial symbol (by default a plus sign).
    If there is no match, the $nomatch symbol (by default a period).
    $matrix is the substitution matrix (array of rules); when NULL or omitted, the
    rules of the global $chemgrp_matrix are used.
    The result is also stored in the result property of the calling object.
    Terminates the script if the two strings differ in length.
    */
    function match($str1, $str2, $matrix = null, $equal = null, $partial = "+", $nomatch = ".")
    {
        global $chemgrp_matrix;

        // if the user chose not to use a custom submatrix, use the default one.
        if (!isset($matrix) and isset($chemgrp_matrix)) $matrix = $chemgrp_matrix->rules;

        // if the strings differ in length, terminate code execution.
        $seqlength = strlen($str1);
        if ($seqlength != strlen($str2))
            die("Cannot match sequences with unequal lengths");

        // Match the two strings, character by character. Each call to compare_letter()
        // returns a "result character" which is appended to the "result string".
        $resultstr = "";
        for ($i = 0; $i < $seqlength; $i++) {
            $let1 = substr($str1, $i, 1);
            $let2 = substr($str2, $i, 1);
            $resultstr .= compare_letter($let1, $let2, $matrix, $equal, $partial, $nomatch);
        }

        // Store the result on the object and also return it, so callers can use
        // whichever form is more convenient.
        $this->result = $resultstr;
        return $resultstr;
    }
}
?>