WeChatMsg/resource/datasets/__init__.py
2023-05-22 11:32:12 +08:00

142 lines
4.4 KiB
Python

import difflib
import os
import typing
import urllib.request
import simplejson as json
class FuzzyDict(dict):
    """Dictionary whose ``in`` and ``[]`` lookups fall back to fuzzy matching.

    An exact key hit behaves like a plain ``dict``. Otherwise every key is
    compared with the requested one using ``difflib.SequenceMatcher`` and the
    closest key is accepted when its similarity ratio reaches ``cutoff``.

    Only ``__contains__`` and ``__getitem__`` are overridden in this class;
    all other ``dict`` methods are inherited as-is.
    """

    def __init__(self, cutoff: float = 0.6):
        """Construct a new, empty FuzzyDict instance.

        cutoff is the match ratio below which matches should not be
        considered. It needs to be a float between 0 and 1 (where zero is
        no match and 1 is a perfect match).
        """
        super().__init__()
        self.cutoff = cutoff

    # Short wrappers around the plain (exact-match) dict methods.
    # They are real methods rather than per-instance lambdas: a lambda stored
    # on the instance closes over the *original* object, so a copy made with
    # ``copy.copy`` would keep consulting the original's storage.
    def _dict_contains(self, key: typing.Any) -> bool:
        """Exact-match membership test, bypassing the fuzzy override."""
        return super().__contains__(key)

    def _dict_getitem(self, key: typing.Any) -> typing.Any:
        """Exact-match item access, bypassing the fuzzy override."""
        return super().__getitem__(key)

    def _search(
        self, lookfor: typing.Any, stop_on_first: bool = False
    ) -> typing.Tuple[bool, typing.Any, typing.Any, float]:
        """Return the entry whose key best matches lookfor.

        The result is a 4-tuple ``(matched, key, value, ratio)`` where
        ``matched`` tells whether ``ratio`` reached ``self.cutoff``.

        If stop_on_first is True the scan stops as soon as it finds the
        first key whose ratio reaches the cutoff, instead of looking for
        the overall best one.
        """
        # If the item is in the dictionary then just return it.
        if self._dict_contains(lookfor):
            return True, lookfor, self._dict_getitem(lookfor), 1

        # Set up the fuzzy matching tool.
        ratio_calc = difflib.SequenceMatcher()
        ratio_calc.set_seq1(lookfor)

        # Test each key in the dictionary.
        best_ratio = 0
        best_match = None
        best_key = None
        for key in self:
            # If the current key is not a string (not a sequence)
            # then we just skip it.
            try:
                ratio_calc.set_seq2(key)
            except TypeError:
                continue

            # We get an error here if the item to look for is not a
            # string - it cannot be fuzzy matched, and since we got this
            # far it is definitely not in the dictionary.
            try:
                ratio = ratio_calc.ratio()
            except TypeError:
                break

            # If this is the best ratio so far - save it and the value.
            if ratio > best_ratio:
                best_ratio = ratio
                best_key = key
                best_match = self._dict_getitem(key)

            if stop_on_first and ratio >= self.cutoff:
                break

        return best_ratio >= self.cutoff, best_key, best_match, best_ratio

    def __contains__(self, item: typing.Any) -> bool:
        """Return True when item matches some key exactly or fuzzily."""
        return bool(self._search(item, True)[0])

    def __getitem__(self, lookfor: typing.Any) -> typing.Any:
        """Return the value stored under the key that best matches lookfor.

        Raises KeyError (naming the closest key and its ratio) when no key
        reaches the cutoff.
        """
        matched, key, item, ratio = self._search(lookfor)
        if not matched:
            raise KeyError(
                "'%s'. closest match: '%s' with ratio %.3f"
                % (str(lookfor), str(key), ratio)
            )
        return item
# Directory that holds this module and its bundled JSON data files.
__HERE = os.path.abspath(os.path.dirname(__file__))


def _load_fuzzy_dict(filename: str) -> FuzzyDict:
    """Load the JSON object stored in filename (next to this module).

    The file is read as UTF-8 and its top-level key/value pairs are copied
    into a new FuzzyDict, which is returned.
    """
    table = FuzzyDict()
    with open(os.path.join(__HERE, filename), "r", encoding="utf8") as f:
        table.update(json.load(f))
    return table


# Lookup tables populated from the bundled JSON files at import time; the
# register_* functions in this module extend them later.
FILENAMES: FuzzyDict = _load_fuzzy_dict("map_filename.json")
COORDINATES: FuzzyDict = _load_fuzzy_dict("city_coordinates.json")
# Maps a JS file URL prefix to the {name: [file_name, "js"]} table that was
# read from the registry found under that asset URL.
EXTRA = {}


def register_url(asset_url: str):
    """Read ``<asset_url>/registry.json`` and record its files in EXTRA.

    The registry is expected to provide three keys: ``PINYIN_MAP``
    (name -> pinyin), ``FILE_MAP`` (pinyin or English name -> file name)
    and ``JS_FOLDER``. Every name is stored as ``[file_name, "js"]`` under
    the prefix ``<asset_url>/<JS_FOLDER>/`` (or ``<asset_url>/`` when
    ``JS_FOLDER`` is ``"/"``).

    Nothing happens when asset_url is empty. Errors raised while fetching
    or decoding the registry, and KeyError for missing registry keys,
    propagate to the caller.
    """
    if not asset_url:
        return

    registry = asset_url + "/registry.json"
    # Use the response as a context manager so the connection / file handle
    # is closed as soon as the payload has been read.
    with urllib.request.urlopen(registry) as response:
        contents = json.loads(response.read())

    pinyin_map = contents["PINYIN_MAP"]
    file_map = contents["FILE_MAP"]

    # Names that go through the pinyin indirection.
    files = {
        name: [file_map[pinyin], "js"] for name, pinyin in pinyin_map.items()
    }

    # Remaining FILE_MAP keys are not pinyin aliases: English names.
    pinyin_names = set(pinyin_map.values())
    files.update(
        (key, [file_name, "js"])
        for key, file_name in file_map.items()
        if key not in pinyin_names
    )

    js_folder_name = contents["JS_FOLDER"]
    if js_folder_name == "/":
        js_file_prefix = f"{asset_url}/"
    else:
        js_file_prefix = f"{asset_url}/{js_folder_name}/"
    EXTRA[js_file_prefix] = files
def register_files(asset_files: dict):
    """Merge asset_files into the module-level FILENAMES table.

    Existing keys are overwritten by the new entries. A falsy argument
    (``None`` or an empty dict) leaves FILENAMES untouched.
    """
    if not asset_files:
        return
    FILENAMES.update(asset_files)
def register_coords(coords: dict):
    """Merge coords into the module-level COORDINATES table.

    Existing keys are overwritten by the new entries. A falsy argument
    (``None`` or an empty dict) leaves COORDINATES untouched.
    """
    if not coords:
        return
    COORDINATES.update(coords)