Source code for dae.pheno.utils.commons

import re

# remove annoying characters
# https://stackoverflow.com/questions/6609895/efficiently-replace-bad-characters
ANNOYING_CHARACTERS = {
    "\xc2\x82": ",",  # High code comma
    "\xc2\x84": ",,",  # High code double comma
    "\xc2\x85": "...",  # Tripple dot
    "\xc2\x88": "^",  # High carat
    "\xc2\x91": "'",  # Forward single quote
    "\xc2\x92": "'",  # Reverse single quote
    "\xc2\x93": '"',  # Forward double quote
    "\xc2\x94": '"',  # Reverse double quote
    "\xc2\x95": " ",
    "\xc2\x96": "-",  # High hyphen
    "\xc2\x97": "--",  # Double hyphen
    "\xc2\x99": " ",
    "\xc2\xa0": " ",
    "\xc2\xa6": "|",  # Split vertical bar
    "\xc2\xab": "<<",  # Double less than
    "\xc2\xbb": ">>",  # Double greater than
    "\xc2\xbc": "1/4",  # one quarter
    "\xc2\xbd": "1/2",  # one half
    "\xc2\xbe": "3/4",  # three quarters
    "\xca\xbf": "\x27",  # c-single quote
    "\xcc\xa8": "",  # modifier - under curve
    "\xcc\xb1": "",  # modifier - under line
    "\xe2\x89\xa4": "<=",
    "\xe2\x89\xa5": ">=",
    "\xbd": " 1/2",
    "\x91": "'",
    "\x92": "'",
    "\x93": '"',
    "\x94": '"',
    "\x95": " ",
    "\x96": "-",  # High hyphen
    "\x97": "--",  # Double hyphen
    "\x99": " ",
}


[docs]def remove_annoying_characters(text: str) -> str: def replace_chars(match: re.Match) -> str: char = match.group(0) return ANNOYING_CHARACTERS[char] return re.sub( "(" + "|".join(list(ANNOYING_CHARACTERS.keys())) + ")", replace_chars, text, )