Root Zanli
Home
Console
Upload
information
Create File
Create Folder
About
Tools
:
/
opt
/
gsutil
/
third_party
/
charset_normalizer
/
tests
/
Filename :
test_coherence_detection.py
back
Copy
import pytest from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding, get_target_features, filter_alt_coherence_matches @pytest.mark.parametrize( "iana_encoding, expected_languages", [ ("cp864", ["Arabic", "Farsi"]), ("cp862", ["Hebrew"]), ("cp737", ["Greek"]), ("cp424", ["Hebrew"]), ("cp273", ["Latin Based"]), ("johab", ["Korean"]), ("shift_jis", ["Japanese"]), ("mac_greek", ["Greek"]), ("iso2022_jp", ["Japanese"]) ] ) def test_infer_language_from_cp(iana_encoding, expected_languages): languages = mb_encoding_languages(iana_encoding) if is_multi_byte_encoding(iana_encoding) else encoding_languages(iana_encoding) for expected_language in expected_languages: assert expected_language in languages, "Wrongly detected language for given code page" @pytest.mark.parametrize( "language, expected_have_accents, expected_pure_latin", [ ("English", False, True), ("French", True, True), ("Hebrew", False, False), ("Arabic", False, False), ("Vietnamese", True, True), ("Turkish", True, True) ] ) def test_target_features(language, expected_have_accents, expected_pure_latin): target_have_accents, target_pure_latin = get_target_features(language) assert target_have_accents is expected_have_accents assert target_pure_latin is expected_pure_latin @pytest.mark.parametrize( "matches, expected_return", [ ([("English", 0.88,), ("English—", 0.99)], [("English", 0.99)]), ([("English", 0.88,), ("English—", 0.99), ("English——", 0.999)], [("English", 0.999)]), ([("English", 0.88,), ("English—", 0.77)], [("English", 0.88)]), ([("English", 0.88,), ("Italian", 0.77)], [("English", 0.88), ("Italian", 0.77)]), ] ) def test_filter_alt_coherence_matches(matches, expected_return): results = filter_alt_coherence_matches(matches) assert results == expected_return