english version "1.0" identify "@(#)differ.sts 1.2 96/02/25" #: Copyright (c) 1995, 2002 by Wayne C. Gramlich. #, All rights reserved. #, #, Permission to use, copy, modify, distribute, and sell this software #, for any purpose is hereby granted without fee provided that the above #, copyright notice and this permission are retained. The author makes #, no representations about the suitability of this software for any purpose. #, It is provided "as is" without express or implied warranty. module differ #: This module implements algorithms for determining the differences #, between sequences of objects. This code is parameterized so that #, it can work on any arbitrary sequence of objects (e.g. vector[item], #, string, etc.) import address character format in_stream integer logical out_stream set string system table unsigned vector define align_type #: Alignment type enumeration differ #: Alignment is different same #: Alignemnt is the same smashed #: Alignment is smashed generate equal, print define align[item] #: One set of alignments record differ_table differ_table[item] #: Parent {differ_table} sequence1 sequence[item] #: First sequence sequence2 sequence[item] #: Second sequence type align_type #: Alignment type generate address_get, allocate, erase, identical define aligns[item] #: Sequence of {align} objects record differ_table differ_table[item] #: Parent {differ_table} list vector[align[item]] #: List of {align} objects generate address_get, allocate, erase, identical define differ_table[item] #: Some tables for computing diffs record debug_stream out_stream #: Debugging stream aligns aligns[item] #: Temp. align list aligns_free aligns[item] #: Free diff. list matches vector[match[item]] #: Temp. match list matches_free vector[match[item]] #: Free match list match_table table[item, match[item]] #: Match table for unique lines table set[sequence[item]] #: Table for lookups sequences vector[sequence[item]] #: Temp. sequences list sequences_free vector[sequence[item]] #: Free sequences list generate allocate, erase, identical, print define match[item] #: On set of matches record differ_table differ_table[item] #: Parent {differ_table} item item #: Matching item index1 unsigned #: First matching index index2 unsigned #: Second matching index generate address_get, allocate, erase, identical define sequence[item] #: A sequence of {item} objects record differ_table differ_table[item] #: Parent {differ_table} items vector[item] #: Object sequence is part of items_hash unsigned #: Hash value for {items} start unsigned #: Start offset of sequence length unsigned #: Number of objects in sequence generate address_get, allocate, erase, identical #: {align} procedures: procedure deallocate@align[item] takes align align[item] returns_nothing #: This procedure will deallocate {sequence}. align_deallocate@(align.differ_table, align) # Smash all of the fields: align.type := differ align.sequence1 := ?? align.sequence2 := ?? procedure greater_than@align[item] takes align1 align[item] align2 align[item] returns logical #: This procedure return {true} if {align1} is greater than {align2} #, with no overlap or {false} otherwise. return align1.sequence1 > align2.sequence1 && align1.sequence2 > align2.sequence2 procedure is_smashed@align[item] takes align align[item] returns logical #: This procedure returns {true} if {align} is "smashed" and {false} #, otherwise. return align.type = smashed procedure less_than@align[item] takes align1 align[item] align2 align[item] returns logical #: This procedure return {true} if {align1} is less than {align2} #, with no overlap or {false} otherwise. return align1.sequence1 < align2.sequence1 && align1.sequence2 < align2.sequence2 procedure print@align[item] takes align align[item] out_stream out_stream returns_nothing put@('[type: ', out_stream) print@(align.type, out_stream) put@(' sequence1: ', out_stream) print@(align.sequence1, out_stream) put@(', sequence2: ', out_stream) print@(align.sequence2, out_stream) put@("]", out_stream) procedure smash@align[item] takes align align[item] returns_nothing #: This procedure will mark {align} as an empty sequence. smash@(align.sequence1) smash@(align.sequence2) align.type := smashed #: {aligns} procedures: procedure append@aligns[item] takes aligns aligns[item] align align[item] returns_nothing #: This procedure will append {align} to {aligns}. append@(aligns.list, align) procedure create@aligns[item] takes differ_table differ_table[item] returns aligns[item] #: This procedure will create and return a new {aligns} object #, containing {differ_table}. initialize aligns:: aligns[item] := allocate@aligns[item]() aligns.differ_table := differ_table aligns.list := allocate@vector[align[item]]() return aligns procedure cull@aligns[item] takes aligns aligns[item] returns_nothing #: This procedure will remove any {align} objects from {aligns} #, that are "smashed". size :@= aligns.size from_index :@= 0 to_index :@= 0 loop while from_index < size align :@= aligns[from_index] if is_smashed@(align) deallocate@(align) else aligns[to_index] := align to_index :+= 1 from_index :+= 1 truncate@(aligns, to_index) procedure differ_insert@aligns[item] takes aligns aligns[item] items1 vector[item] items2 vector[item] returns_nothing #: This procedure will add all of the differing {align} objects #, {aligns}. differ_table :@= aligns.differ_table debug_stream :@= differ_table.debug_stream size :@= aligns.size if size = 0 size1 :@= items1.size size2 :@= items2.size if size1 != 0 || size2 != 0 #put@("aligns_differ 1a\n\", debug_stream) align :@= align_allocate@(differ_table, differ, items1, 0, size1, items2, 0, size2) #put@("aligns_differ 1b\n\", debug_stream) append@(aligns, align) else differences :@= size - 1 # Is there a front difference? has_front_difference:: logical := false align :@= aligns[0] if align.sequence1.start != 0 || align.sequence2.start != 0 has_front_difference := true differences :+= 1 # Is there a back difference? has_back_difference:: logical := false align :@= aligns[size - 1] sequence1 :@= align.sequence1 sequence2 :@= align.sequence2 assert items1 == sequence1.items assert items2 == sequence2.items if sequence1.start + sequence1.length != items1.size || sequence2.start + sequence2.length != items2.size has_back_difference := true differences :+= 1 # Allocate the needed new slots: to_index :@= size + differences loop while differences != 0 append@(aligns, ??) differences :-=1 # Fill in any differing back: if has_back_difference align :@= aligns[size - 1] sequence1 :@= align.sequence1 sequence2 :@= align.sequence2 assert items1 == sequence1.items assert items2 == sequence2.items start1 :@= sequence1.start + sequence1.length start2 :@= sequence2.start + sequence2.length length1 :@= items1.size - start1 length2 :@= items2.size - start2 #put@("aligns_differ 2a\n\", debug_stream) align := align_allocate@(differ_table, differ, items1, start1, length1, items2, start2, length2) #put@("aligns_differ 2b\n\", debug_stream) to_index :-= 1 aligns[to_index] := align # Fill in any intermediate stuff: from_index :@= size - 1 loop while from_index != 0 # Fill the next {same} {align}: align_high :@= aligns[from_index] to_index :-= 1 aligns[to_index] := align_high #put@("aligns_differ 3a\n\", debug_stream) # Create the next {differ} {align}: from_index :-= 1 align_low :@= aligns[from_index] sequence1 :@= align_low.sequence1 sequence2 :@= align_low.sequence2 assert items1 == sequence1.items assert items2 == sequence2.items #format@format4[unsigned, unsigned, unsigned, # unsigned](debug_stream, # "low: s1s: %d% s1l: %d% s2s:%d% s2l:%d%\n\", # sequence1.start, sequence1.length, # sequence2.start, sequence2.length) start1 :@= sequence1.start + sequence1.length start2 :@= sequence2.start + sequence2.length sequence1 := align_high.sequence1 sequence2 := align_high.sequence2 #format@format4[unsigned, unsigned, unsigned, # unsigned](debug_stream, # "high: s1s: %d% s1l: %d% s2s:%d% s2l:%d%\n\", # sequence1.start, sequence1.length, # sequence2.start, sequence2.length) assert items1 == sequence1.items assert items2 == sequence2.items length1 :@= sequence1.start - start1 length2 :@= sequence2.start - start2 #format@format4[unsigned, unsigned, unsigned, # unsigned](debug_stream, "s1:%d% l1:%d% s2:%d% l2:%d%\n\", # start1, length1, start2, length2) align := align_allocate@(differ_table, differ, items1, start1, length1, items2, start2, length2) #put@("aligns_differ 3b\n\", debug_stream) to_index :-= 1 aligns[to_index] := align # Copy the last {same} align into the final position: align :@= aligns[0] to_index :-= 1 aligns[to_index] := align # Fill in any differing front: if has_front_difference align :@= aligns[0] sequence1 :@= align.sequence1 sequence2 :@= align.sequence2 assert items1 == sequence1.items assert items2 == sequence2.items length1 :@= sequence1.start length2 :@= sequence2.start #put@("aligns_differ 4a\n\", debug_stream) align := align_allocate@(differ_table, differ, items1, 0, length1, items2, 0, length2) #put@("aligns_differ 4b\n\", debug_stream) to_index :-= 1 aligns[to_index] := align assert to_index = 0 procedure fetch1@aligns[item] takes aligns aligns[item] index unsigned returns align[item] #: This procedure will return the {index}'th {align} object from {aligns}. return aligns.list[index] procedure order@aligns[item] takes aligns aligns[item] start unsigned length unsigned returns_nothing #: This procedure will order {aligns} so that all {align} objects #, between {start} and {start} + {length} - 1 are ordered before #, or after the largest matching region. debug_stream :@= aligns.differ_table.debug_stream #format@format2[unsigned, unsigned](debug_stream, # "order@aligns(%d%, %d%)\n\", start, length) size :@= length if size > 1 # Step 1: Find the {align} with the largest length and put it at front: match :@= aligns[start] match_index :@= 0 match_length :@= match.sequence1.length index :@= 1 loop while index < size align :@= aligns[start + index] length :@= align.sequence1.length if length > match_length match :@= align match_index :@= index match_length := length index :+= 1 aligns[start + match_index] := aligns[start] aligns[start] := match # Step 2: Sweep aligns into culls:below:match:above order: #put@("match: ", debug_stream) #print@(match, debug_stream) #put@("\n\", debug_stream) cull_index :@= 0 # Next place to insert a "cull" below_index :@= 0 # Next place to insert a "below" {align} match_index := 1 # Next place to move {match} to above_index :@= 1 # Next place to insert an "above" {align} index := 1 loop assert 0 <= cull_index && cull_index <= below_index assert below_index < match_index && match_index <= above_index assert index = above_index while index < size align :@= aligns[start +index] if align > match # The align is "above". Just leave it where it is: #put@("above:", debug_stream) #print@(align, debug_stream) #put@("\n\", debug_stream) aligns[start + above_index] := align above_index :+= 1 else_if align < match # The align is "below". Move things up by one: #put@("below1:", debug_stream) #print@(align, debug_stream) #put@("\n\", debug_stream) aligns[start + above_index] := aligns[start + match_index] above_index :+= 1 aligns[start + match_index] := match match_index :+= 1 aligns[start + below_index] := align below_index :+= 1 else # Must be a cull: #put@("cull1:", debug_stream) #print@(align, debug_stream) #put@("\n\", debug_stream) smash@(align) aligns[start + above_index] := aligns[start + match_index] above_index :+= 1 aligns[start + match_index] := match match_index :+= 1 aligns[start + below_index] := aligns[start + cull_index] below_index :+= 1 aligns[start + cull_index] := align cull_index :+= 1 index :+= 1 assert above_index = size # Step 3: Divide and conquer: below_length :@= below_index - cull_index if below_length > 1 order@(aligns, start + cull_index, below_length) above_size :@= above_index - match_index if above_size > 1 order@(aligns, start + match_index, above_size) procedure pop@aligns[item] takes aligns aligns[item] returns align[item] #: This procedure will return the {align} object at the end of {aligns} #, and reduce the size of {aligns} by one. return pop@(aligns.list) procedure print@aligns[item] takes aligns aligns[item] out_stream out_stream returns_nothing #: This procedure will print {aligns} to {out_stream}. list :@= aligns.list size :@= list.size #format@format1[unsigned](out_stream, "size: %d%\n\", size) index :@= 0 loop while index < size align :@= list[index] format@format1[unsigned](out_stream, "%d%: ", index) print@(align, out_stream) put@("\n\", out_stream) index :+= 1 procedure size_get@aligns[item] takes aligns aligns[item] returns unsigned #: This procedure will return the size of {aligns}. return aligns.list.size procedure store1@aligns[item] takes aligns aligns[item] index unsigned align align[item] returns_nothing #: This procedure will store {align} into {aligns}. aligns.list[index] := align procedure truncate@aligns[item] takes aligns aligns[item] new_size unsigned returns_nothing #: This procedure will trucate {aligns} to only have {new_size} #, {align} objects. truncate@(aligns.list, new_size) procedure verify@aligns[item] takes aligns aligns[item] label string returns_nothing #: This procedure will verify that {aligns} is well ordered. debug_stream :@= aligns.differ_table.debug_stream errors:: logical := false size :@= aligns.size index :@= 1 loop while index < size align_low :@= aligns[index - 1] align_high :@= aligns[index] if align_low >= align_high format@format3[string, unsigned, unsigned](debug_stream, "%ds%: [%d%] !< [%d%]\n\", label, index - 1, index) errors := true index :+= 1 if errors print@(aligns, debug_stream) #: {differ_table} procedures: procedure align_allocate@differ_table[item] takes differ_table differ_table[item] type align_type items1 vector[item] start1 unsigned length1 unsigned items2 vector[item] start2 unsigned length2 unsigned returns align[item] #: This procedure will allocate and return a new {align} object #, containing {items1}, {start1}, {length2}, {items2}, #, {start2}, and {length2}. # debug_stream :@= differ_table.debug_stream # Allocate the {align} object: align:: align[item] := ?? aligns_free :@= differ_table.aligns_free if aligns_free.size = 0 align := allocate@align[item]() align.differ_table := differ_table else align :@= pop@(aligns_free) assert align.differ_table == differ_table # Fill in the fields: sequence1 :@= sequence_allocate@(differ_table, items1, start1, length1) sequence2 :@= sequence_allocate@(differ_table, items2, start2, length2) align.sequence1 := sequence1 align.sequence2 := sequence2 align.type := type return align procedure align_deallocate@differ_table[item] takes differ_table differ_table[item] align align[item] returns_nothing #: This procedure will deallocate {align} for {differ_table}. # Verify that we are not double deallocating: aligns_free :@= differ_table.aligns_free size :@= aligns_free.size index :@= 0 loop while index < size assert align !== aligns_free[index] index :+= 1 deallocate@(align.sequence1) deallocate@(align.sequence2) append@(aligns_free, align) procedure aligns_find@differ_table[item] takes differ_table differ_table[item] items1 vector[item] items2 vector[item] returns aligns[item] needs procedure equal@item takes item, item returns logical procedure hash@item takes item returns unsigned procedure identical@item takes item, item returns logical #: This procedure will return all of the alignments between #, {items1} and {items2} using {differ_table} to hold all of #, the intermediate state: debug_stream :@= differ_table.debug_stream aligns :@= aligns_match@(differ_table, items1, items2) order@(aligns, 0, aligns.size) cull@(aligns) verify@(aligns, "cull") differ_insert@(aligns, items1, items2) verify@(aligns, "differ_insert") return aligns procedure aligns_match@differ_table[item] takes differ_table differ_table[item] items1 vector[item] items2 vector[item] returns aligns[item] needs procedure equal@item takes item, item returns logical procedure hash@item takes item returns unsigned procedure identical@item takes item, item returns logical #: This procedure will identify and return the initial matching #, {align} objects for each matching span in {items1} and {itmes2} #, that contains a single uniquely matching {item}. # The basic algorithm occurs in two steps. The first step #, is to find items that occur exactly once in {items1} and #, {items2}. These items are high probability match ups. #, The second step is to use a heavier weight alogrithm to #, find matches in the remaining unmatched sections. debug_stream :@= differ_table.debug_stream # Throw out any {item}'s in {items1} that are not unique: match_table :@= differ_table.match_table assert match_table.size = 0 matches :@= differ_table.matches assert matches.size = 0 size1 :@= items1.size index1 :@= 0 loop while index1 < size1 item :@= items1[index1] assert item !== ?? match :@= lookup@(match_table, item) if match == ?? # This is the first time for this {item}: match :@= match_allocate@(differ_table) assert match !== ?? append@(matches, match) match.item := item match.index1 := index1 assert !(insert@(match_table, item, match)) else # We have a second match -- smash it: assert match.item !== ?? assert match.item = item smash@(match) index1 :+= 1 # Now iterate through {items2} looking for singleton matches in {items1}: size2 :@= items2.size index2 :@= 0 loop while index2 < size2 item :@= items2[index2] match :@= lookup@(match_table, item) if match !== ?? && match.index1 != 0xffffffff assert match.item !== ?? assert match.item = item if match.index2 = 0xffffffff # First match of item: match.index2 := index2 else # We have a second match -- smash it: smash@(match) index2 :+= 1 # Expand singleton matches: size :@= matches.size assert match_table.size = size aligns :@= differ_table.aligns truncate@(aligns, 0) index :@= 0 loop while index < size match :@= matches[index] assert match !== ?? item :@= match.item assert item !== ?? if match.index2 != 0xffffffff # We have two singleton matches: align :@= match_expand@(differ_table, match, items1, items2) append@(aligns, align) assert !delete@(match_table, item) deallocate@(match) index :+= 1 truncate@(matches, 0) assert match_table.size = 0 return aligns procedure create@differ_table[item] takes_nothing returns differ_table[item] #: This procedure will create and return a new {differ_table} object. initialize differ_table:: differ_table[item] := allocate@differ_table[item]() differ_table.aligns := create@aligns[item](differ_table) differ_table.aligns_free := create@aligns[item](differ_table) differ_table.debug_stream := ?? differ_table.matches := allocate@vector[match[item]]() differ_table.matches_free := allocate@vector[match[item]]() differ_table.match_table := xcreate@table[item, match[item]](100) differ_table.table := xcreate@set[sequence[item]](100) differ_table.sequences := allocate@vector[sequence[item]]() differ_table.sequences_free := allocate@vector[sequence[item]]() return differ_table procedure match_allocate@differ_table[item] takes differ_table differ_table[item] returns match[item] #: This procedure will allocate and return a {match} object from #, {differ_table}. matches_free :@= differ_table.matches_free match:: match[item] := ?? if matches_free.size = 0 match := allocate@match[item]() match.differ_table := differ_table else match := pop@(matches_free) assert match.differ_table == differ_table match.item := ?? smash@(match) debug_stream :@= differ_table.debug_stream return match procedure match_deallocate@differ_table[item] takes differ_table differ_table[item] match match[item] returns_nothing #: This procedure will deallocate {match} into {differ_table}. # Verify that there are no duplicate frees: matches_free :@= differ_table.matches_free size :@= matches_free.size index :@= 0 loop while index < size assert matches_free[index] !== match index :+= 1 append@(matches_free, match) procedure match_expand@differ_table[item] takes differ_table differ_table[item] match match[item] items1 vector[item] items2 vector[item] returns align[item] needs procedure equal@item takes item, item returns logical procedure hash@item takes item returns unsigned #: This procedure will expand {match} until is spans as many #, matching items as possible. debug_stream :@= differ_table.debug_stream # Pull out some data structures: match_table :@= differ_table.match_table size1 :@= items1.size size2 :@= items2.size start1 :@= match.index1 start2 :@= match.index2 length1 :@= 1 length2 :@= 1 # Look for following matches: index1 :@= start1 index2 :@= start2 loop index1 :+= 1 index2 :+= 1 while index1 < size1 && index2 < size2 item1 :@= items1[index1] item2 :@= items2[index2] while item1 = item2 length1 :+= 1 length2 :+= 1 match :@= lookup@(match_table, item1) if match !== ?? smash@(match) #: Look for previous matching items: loop while start1 != 0 && start2 != 0 item1 :@= items1[start1 - 1] item2 :@= items2[start2 - 1] while item1 = item2 start1 :-= 1 start2 :-= 1 length1 :+= 1 length2 :+= 1 match :@= lookup@(match_table, item1) if match !== ?? smash@(match) align :@= align_allocate@(differ_table, same, items1, start1, length1, items2, start2, length2) return align procedure sequence_allocate@differ_table[item] takes differ_table differ_table[item] items vector[item] start unsigned length unsigned returns sequence[item] #: This procedure will allocate and return an uninitialized #, {sequence} object from {differ_table}. debug_stream :@= differ_table.debug_stream assert length < 1000000 # Temporary for now! sequence:: sequence[item] := ?? sequences_free :@= differ_table.sequences_free if sequences_free.size = 0 sequence := allocate@sequence[item]() sequence.differ_table := differ_table else sequence := pop@(sequences_free) assert sequence.differ_table == differ_table sequence.items := items sequence.start := start sequence.length := length return sequence procedure sequence_deallocate@differ_table[item] takes differ_table differ_table[item] sequence sequence[item] returns_nothing #: This procedure will deallocate {sequence} into {differ_table}. # Verify that we are not double deallocating: sequences_free :@= differ_table.sequences size :@= sequences_free.size index :@= 0 loop while index < size assert sequence !== sequences_free[index] index :+= 1 append@(sequences_free, sequence) #: {match} procedures: procedure deallocate@match[item] takes match match[item] returns_nothing #: This procedure will deallocate {match}. match_deallocate@(match.differ_table, match) smash@(match) match.item := ?? procedure print@match[item] takes match match[item] out_stream out_stream returns_nothing assert false procedure smash@match[item] takes match match[item] returns_nothing #: This procedure will set the contents of {match} so that it #, no longer indicates a match. match.index1 := 0xffffffff match.index2 := 0xffffffff #: {sequence} procedures: procedure deallocate@sequence[item] takes sequence sequence[item] returns_nothing #: This procedure will deallocate {sequence}. sequence_deallocate@(sequence.differ_table, sequence) # Smash all of the fields: sequence.items := ?? sequence.start := 0xffffffff sequence.length := 0xffffffff procedure equal@sequence[item] takes sequence1 sequence[item] sequence2 sequence[item] returns logical needs procedure equal@item takes item, item returns logical #: This procedure will return {true} if {sequence1} equals {sequence2} #, and {false} otherwise. #if sequence1.items_hash != sequence2.items_hash # return false items1 :@= sequence1.items start1 :@= sequence1.start length1 :@= sequence1.length items2 :@= sequence2.items start2 :@= sequence2.start length2 :@= sequence2.length if length1 != length2 return false index :@= 0 loop while index < length1 if items1[start1 + index] != items2[start2 + index] return false index :+= 1 return true procedure greater_than@sequence[item] takes sequence1 sequence[item] sequence2 sequence[item] returns logical #: This procedure will return {true} if {sequence1} is greater than #, {sequence2} with no overlap; otherwise, {false} is returned. assert sequence1.items !== ?? assert sequence2.items !== ?? assert sequence1.items == sequence2.items return sequence1.start >= sequence2.start + sequence2.length procedure hash@sequence[item] takes sequence sequence[item] returns unsigned needs procedure hash@item takes item returns unsigned #: This procedure will return a hash value for {sequence}. items_hash :@= sequence.items_hash if items_hash = 0 items :@= sequence.items start :@= sequence.start length :@= sequence.length index :@= 0 loop while index < length item :@= items[start + index] items_hash :+= hash@(item) index :+= 1 #sequence.items_hash := items_hash return items_hash procedure less_than@sequence[item] takes sequence1 sequence[item] sequence2 sequence[item] returns logical #: This procedure will return {true} if {sequence1} is less than #, {sequence2} with no overlap; otherwise, {false} is returned. assert sequence1.items !== ?? assert sequence2.items !== ?? assert sequence1.items == sequence2.items return sequence1.start + sequence1.length <= sequence2.start procedure print@sequence[item] takes sequence sequence[item] out_stream out_stream returns_nothing needs procedure print@item takes item, out_stream returns_nothing #: This procedure outputs {sequence} to {out_stream}. items :@= sequence.items start :@= sequence.start length :@= sequence.length format@format3[unsigned, unsigned, address](out_stream, '[%d% %d% %X% ', start, length, items.address) index :@= 0 loop while index < length item :@= items[start + index] print@(item, out_stream) index :+= 1 put@("]", out_stream) procedure show_prefixed@sequence[item] takes sequence sequence[item] prefix string out_stream out_stream returns_nothing needs procedure show_prefixed@item takes item, string, out_stream returns_nothing #: This procedure will output {sequence} to {out_stream} with each #, {item} prefixed with {prefix}. items :@= sequence.items start :@= sequence.start length :@= sequence.length index :@= 0 loop while index < length item :@= items[start + index] show_prefixed@(item, prefix, out_stream) index :+= 1 procedure smash@sequence[item] takes sequence sequence[item] returns_nothing #: This procedure will convert {sequence} into a zero length sequence. sequence.start := 0xffffffff sequence.length := 0 sequence.items_hash := 0