-
-
Notifications
You must be signed in to change notification settings - Fork 46.9k
Added Burrows-Wheeler transform algorithm. #1029
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
401ed00
Added doctest and more explanation about Dijkstra execution.
brunohadlich d13eb2c
tests were not passing with python2 due to missing __init__.py file a…
brunohadlich 0e4e788
Removed the dot at the beginning of the imported modules names becaus…
brunohadlich 8c4c820
Moved global code to main scope and added doctest for project euler p…
brunohadlich 9fa20b3
Added test case for negative input.
brunohadlich 8b17220
Changed N variable to do not use end of line scape because in case th…
brunohadlich 6c11811
Added problems description and doctests to the ones that were missing…
brunohadlich cf4a9cf
Merge remote-tracking branch 'upstream/master'
brunohadlich b69db30
Changed the way files are loaded to support pytest call.
brunohadlich efa4bad
Added __init__.py to problems to make them modules and allow pytest e…
brunohadlich abfee10
Added project_euler folder to test units execution
brunohadlich 304c09c
Merge branch 'master' of https://github.com/TheAlgorithms/Python
brunohadlich aa87466
Changed 'os.path.split(os.path.realpath(__file__))' to 'os.path.dirna…
brunohadlich f0e1631
Merge branch 'master' of https://github.com/TheAlgorithms/Python
brunohadlich c08c4b5
Added Burrows-Wheeler transform algorithm.
brunohadlich cf3df56
Added changes suggested by cclauss
brunohadlich File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,176 @@ | ||
""" | ||
https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform | ||
|
||
The Burrows–Wheeler transform (BWT, also called block-sorting compression) | ||
rearranges a character string into runs of similar characters. This is useful | ||
for compression, since it tends to be easy to compress a string that has runs | ||
of repeated characters by techniques such as move-to-front transform and | ||
run-length encoding. More importantly, the transformation is reversible, | ||
without needing to store any additional data except the position of the first | ||
original character. The BWT is thus a "free" method of improving the efficiency | ||
of text compression algorithms, costing only some extra computation. | ||
""" | ||
from typing import List, Dict | ||
|
||
|
||
def all_rotations(s: str) -> List[str]:
    """
    Build every left rotation of ``s``, in order of increasing shift.

    The rotation at position ``i`` is ``s[i:] + s[:i]``, so the first element
    is ``s`` itself and an empty string yields an empty list.

    :param s: The string that will be rotated len(s) times.
    :return: A list with the rotations.
    :raises TypeError: If s is not an instance of str.
    Examples:

    >>> all_rotations("^BANANA|") # doctest: +NORMALIZE_WHITESPACE
    ['^BANANA|', 'BANANA|^', 'ANANA|^B', 'NANA|^BA', 'ANA|^BAN', 'NA|^BANA',
    'A|^BANAN', '|^BANANA']
    >>> all_rotations("a_asa_da_casa") # doctest: +NORMALIZE_WHITESPACE
    ['a_asa_da_casa', '_asa_da_casaa', 'asa_da_casaa_', 'sa_da_casaa_a',
    'a_da_casaa_as', '_da_casaa_asa', 'da_casaa_asa_', 'a_casaa_asa_d',
    '_casaa_asa_da', 'casaa_asa_da_', 'asaa_asa_da_c', 'saa_asa_da_ca',
    'aa_asa_da_cas']
    >>> all_rotations("panamabanana") # doctest: +NORMALIZE_WHITESPACE
    ['panamabanana', 'anamabananap', 'namabananapa', 'amabananapan',
    'mabananapana', 'abananapanam', 'bananapanama', 'ananapanamab',
    'nanapanamaba', 'anapanamaban', 'napanamabana', 'apanamabanan']
    >>> all_rotations(5)
    Traceback (most recent call last):
        ...
    TypeError: The parameter s type must be str.
    """
    if not isinstance(s, str):
        raise TypeError("The parameter s type must be str.")

    # Split the string at every offset and swap the two halves.
    return [s[i:] + s[:i] for i in range(len(s))]
|
||
|
||
def bwt_transform(s: str) -> Dict:
    """
    Compute the Burrows-Wheeler transform of ``s``.

    All rotations of ``s`` are sorted; the transform is the last character of
    each sorted rotation, reported together with the position of ``s`` in that
    sorted list (which is what makes the transform reversible).

    :param s: The string that will be used at bwt algorithm
    :return: A dict with two keys: ``"bwt_string"``, the string composed of
        the last char of each row of the ordered rotations, and
        ``"idx_original_string"``, the index of the original string at the
        ordered rotations list
    :raises TypeError: If the s parameter type is not str
    :raises ValueError: If the s parameter is empty
    Examples:

    >>> bwt_transform("^BANANA")
    {'bwt_string': 'BNN^AAA', 'idx_original_string': 6}
    >>> bwt_transform("a_asa_da_casa")
    {'bwt_string': 'aaaadss_c__aa', 'idx_original_string': 3}
    >>> bwt_transform("panamabanana")
    {'bwt_string': 'mnpbnnaaaaaa', 'idx_original_string': 11}
    >>> bwt_transform(4)
    Traceback (most recent call last):
        ...
    TypeError: The parameter s type must be str.
    >>> bwt_transform('')
    Traceback (most recent call last):
        ...
    ValueError: The parameter s must not be empty.
    """
    if not isinstance(s, str):
        raise TypeError("The parameter s type must be str.")
    if not s:
        raise ValueError("The parameter s must not be empty.")

    # Rotations in lexicographic (code point) order.
    rotations = sorted(all_rotations(s))
    return {
        # Last column of the sorted rotation matrix.
        "bwt_string": "".join(rotation[-1] for rotation in rotations),
        # First row equal to the input; needed to invert the transform.
        "idx_original_string": rotations.index(s),
    }
|
||
|
||
def reverse_bwt(bwt_string: str, idx_original_string: int) -> str:
    """
    Invert a Burrows-Wheeler transform and return the original string.

    The sorted rotation table is rebuilt one column at a time: on every pass
    each character of ``bwt_string`` is prepended to the row at the same
    position and the rows are sorted again. After ``len(bwt_string)`` passes
    the rows are the full sorted rotations, and the row at
    ``idx_original_string`` is the original string.

    ``idx_original_string`` is converted with ``int()``, so a float is
    truncated towards zero (``11.4`` is treated as ``11``).

    :param bwt_string: The string returned from bwt algorithm execution
    :param idx_original_string: A 0-based index of the string that was used to
        generate bwt_string at ordered rotations list
    :return: The string used to generate bwt_string when bwt was executed
    :raises TypeError: If the bwt_string parameter type is not str
    :raises ValueError: If the bwt_string parameter is empty
    :raises TypeError: If the idx_original_string type is not int or if not
        possible to cast it to int
    :raises ValueError: If the idx_original_string value is lower than 0 or
        greater than len(bwt_string) - 1

    >>> reverse_bwt("BNN^AAA", 6)
    '^BANANA'
    >>> reverse_bwt("aaaadss_c__aa", 3)
    'a_asa_da_casa'
    >>> reverse_bwt("mnpbnnaaaaaa", 11)
    'panamabanana'
    >>> reverse_bwt(4, 11)
    Traceback (most recent call last):
        ...
    TypeError: The parameter bwt_string type must be str.
    >>> reverse_bwt("", 11)
    Traceback (most recent call last):
        ...
    ValueError: The parameter bwt_string must not be empty.
    >>> reverse_bwt("mnpbnnaaaaaa", "asd") # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
        ...
    TypeError: The parameter idx_original_string type must be int or passive
    of cast to int.
    >>> reverse_bwt("mnpbnnaaaaaa", None) # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
        ...
    TypeError: The parameter idx_original_string type must be int or passive
    of cast to int.
    >>> reverse_bwt("mnpbnnaaaaaa", -1)
    Traceback (most recent call last):
        ...
    ValueError: The parameter idx_original_string must not be lower than 0.
    >>> reverse_bwt("mnpbnnaaaaaa", 12) # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
        ...
    ValueError: The parameter idx_original_string must be lower than
    len(bwt_string).
    >>> reverse_bwt("mnpbnnaaaaaa", 11.0)
    'panamabanana'
    >>> reverse_bwt("mnpbnnaaaaaa", 11.4)
    'panamabanana'
    """
    if not isinstance(bwt_string, str):
        raise TypeError("The parameter bwt_string type must be str.")
    if not bwt_string:
        raise ValueError("The parameter bwt_string must not be empty.")
    try:
        idx_original_string = int(idx_original_string)
    except (TypeError, ValueError, OverflowError) as err:
        # int() raises ValueError for unparsable strings and NaN, TypeError
        # for unsupported types such as None, and OverflowError for infinite
        # floats; report all of them as the single documented TypeError.
        raise TypeError(
            "The parameter idx_original_string type must be int or passive"
            " of cast to int."
        ) from err
    if idx_original_string < 0:
        raise ValueError(
            "The parameter idx_original_string must not be lower than 0."
        )
    if idx_original_string >= len(bwt_string):
        raise ValueError(
            "The parameter idx_original_string must be lower than"
            " len(bwt_string)."
        )

    ordered_rotations = [""] * len(bwt_string)
    for _ in range(len(bwt_string)):
        # Prepend the transform column to the current rows, then re-sort so
        # the rows stay aligned with the positions of bwt_string.
        ordered_rotations = sorted(
            char + rotation
            for char, rotation in zip(bwt_string, ordered_rotations)
        )
    return ordered_rotations[idx_original_string]
|
||
|
||
if __name__ == "__main__":
    # Interactive demo: read a string, show its transform, then invert the
    # transform to show that the original string is recovered.
    entry_msg = "Provide a string that I will generate its BWT transform: "
    s = input(entry_msg).strip()
    result = bwt_transform(s)
    bwt_output_msg = "Burrows Wheeler transform for string '{}' results in '{}'"
    print(bwt_output_msg.format(s, result["bwt_string"]))
    original_string = reverse_bwt(
        result["bwt_string"], result["idx_original_string"]
    )
    fmt = (
        "Reversing Burrows Wheeler transform for entry '{}' we get original"
        " string '{}'"
    )
    print(fmt.format(result["bwt_string"], original_string))
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.