http://en.wikipedia.org/wiki/Levenshtein_distance
This alias is used for comparing the difference between two strings.
/*
* This is a rewritten version of codemastr's Levenshtein Distance alias. This version fixes a few errors
* and runs faster.
* In addition to rewriting parts, I've also included more details in the comments.
* Additional information about the function of the alias is available with the original version:
* http://www.mircscripts.org/showdoc.php?type=code&id=2127
* http://www.mircscripts.org/comments.php?cid=2127
*
* Syntax 1: $levenshtein(string1, string2)
* Syntax 2: $levenshtein(string1, string2, insertCost, replaceCost, deleteCost)
* $editdistance() is the same function.
* Case-sensitive versions are $levenshteincs()/$editdistancecs()
*
* Modified by Yawhatnever (Travis) - irc.swiftirc.net #mSL
* Free to use in any script, just attribute the sources above :)
*/
alias editdistance return $levenshteininternal($1,$2,$3,$4,$5,$false)
alias editdistancecs return $levenshteininternal($1,$2,$3,$4,$5,$true)
alias levenshtein return $levenshteininternal($1,$2,$3,$4,$5,$false)
alias levenshteincs return $levenshteininternal($1,$2,$3,$4,$5,$true)
alias -l levenshteininternal {
var %x = $len($1), %y = $len($2), %matrixsize.y = $calc(%y + 1)
if ($5 isnum) var %ins_cost = $3, %rep_cost = $4, %del_cost = $5
else var %ins_cost = 1, %rep_cost = 1, %del_cost = 1
if (!%x) return $calc(%y * %ins_cost)
if (!%y) return $calc(%x * %ins_cost)
hmake lvmatrix
set -u %matrixsize.x $calc(%x + 1)
;fill bottom row with insert cost
var %i, %c = 1, %cost = %ins_cost
while (%c < %matrixsize.x) {
matrixset %c 0 %cost
inc %c
inc %cost %ins_cost
}
;fill left column with delete cost
var %c = 0, %cost = 0
while (%c < %matrixsize.y) {
matrixset 0 %c %cost
inc %c
inc %cost %del_cost
}
%c = 1
while (%c <= %x) {
%i = 1
while (%i <= %y) {
if ($levenshteinequal($mid($1, %c, 1), $mid($2, %i, 1), $6)) %cost = 0
else %cost = %rep_cost
matrixset %c %i $levenshteinmin(%c, %i, %ins_cost, %del_cost, %cost)
inc %i
}
inc %c
}
var %return $matrixget(%x, %y)
;The following line is used for debug purposes.
;var %c %y | while (%c >= 0) { echo -sg $regsubex($left($str(@-,%matrixsize.x),-1),/@/g,$base($matrixget($calc(\n - 1),%c),10,10,2)) | dec %c }
:error
hfree lvmatrix
return %return
}
alias -l levenshteinmin {
/*
* compare(str1, str2)
* the value at point ($len(str1), $len(str2)) on the grid will be the levenshtein/edit distance
* use $matrixget(x, y) to get the value at point (x, y)
*
*{y}
* 2|4|3|2|1|1|
* r|3|2|1|0|1|
* t|2|1|0|1|2|
* s|1|0|1|2|3|
* |0|1|2|3|4|
* |s|t|r|1|{x}
*/
; bottom row is insert cost, left column is delete cost
; $levenshteinmin(x,y,ins_cost,del_cost,rep_cost)
var %left = $calc($matrixget($calc($1 - 1), $2) + $3)
var %below = $calc($matrixget($1, $calc($2 - 1)) + $4)
var %diag = $calc($matrixget($calc($1 - 1), $calc($2 - 1)) + $5)
return $gettok($sorttok(%left %below %diag, 32, n), 1, 32)
}
alias -l matrixset hadd lvmatrix $calc(%matrixsize.x * $2 + $1) $3
alias -l matrixget {
/*
* $matrixget(x coord, y coord)
* returns the value stored at point (x, y)
* bottom left is (0, 0)
* |6|7|8|
* |3|4|5|
* |0|1|2|
* e.g. value of point(2, 2) is stored in the hash table with key "8"
*/
return $hget(lvmatrix, $calc(%matrixsize.x * $2 + $1))
}
alias -l levenshteinequal {
;character 1, character 2, $true = case sensitive/$false = insensitive
if ($1 != $2) return $false
elseif ($1 === $2) return $true
elseif (!$3) return $true
}
Reminds me of an algorithm we implemented in a programming project over the spring semester. It could take in two files, and based on what words were used and how frequent they appeared, it would give an approximation on the likelihood that the authors were the same. Used more often than not to find plagiarized texts.
You could do exactly that using this algorithm. I think the best way might be to read a file into a binary variable, and then instead of passing $mid(word, %i, 1) to $levenshteinequal() you would loop through with $bfind() and pass the words.
Even in its current state it could compare the text of two files (assuming they were within the line length limit), although it would be comparing the characters rather than the words.
There seems to be a problem when I edit with some variables (mainly %del_cost and %below) being url decoded. If there are any problems there's a raw version here: http://pastebin.com/raw.php?i=Pd5624y2
Yeah. Some practical usage scenarios could be phishing detection or fuzzy string searching. For example: $levenshtein(nickserv, nikserv) which returns 1, or $levenshtein(runescaepee, runescape) which returns 2. I did some work on a bk-tree lookup [ http://nullwords.wordpress.com/2013/03/13/the-bk-tree-a-data-structure-for-spell-checking/ ] snippet for fuzzy searching which uses $levenshtein(), but it has a few issues and I haven't taken the time to sort them out.