Updated script for better splitting of synonyms

This commit is contained in:
Jamie Hardt
2024-10-30 10:51:05 -07:00
parent 485c5175b8
commit b21e80f708
17 changed files with 11401 additions and 27093 deletions
+341 -733
View File
File diff suppressed because it is too large Load Diff
+270 -595
View File
File diff suppressed because it is too large Load Diff
+211 -423
View File
File diff suppressed because it is too large Load Diff
+942 -2489
View File
File diff suppressed because it is too large Load Diff
+499 -1058
View File
File diff suppressed because it is too large Load Diff
+1117 -2972
View File
File diff suppressed because it is too large Load Diff
+942 -2494
View File
File diff suppressed because it is too large Load Diff
+418 -882
View File
File diff suppressed because it is too large Load Diff
+280 -598
View File
File diff suppressed because it is too large Load Diff
+943 -2081
View File
File diff suppressed because it is too large Load Diff
+1018 -2732
View File
File diff suppressed because it is too large Load Diff
+1024 -2760
View File
File diff suppressed because it is too large Load Diff
+946 -2050
View File
File diff suppressed because it is too large Load Diff
+423 -926
View File
File diff suppressed because it is too large Load Diff
+1127 -2359
View File
File diff suppressed because it is too large Load Diff
+896 -1940
View File
File diff suppressed because it is too large Load Diff
+4 -1
View File
@@ -80,7 +80,10 @@ for lang in langs:
 syns_raw = category['Synonyms']
 assert type(syns_raw) == str, \
 f"Synonym list (lang: {lang}, {category['CatID']}) was not readable"
-syn_list = re.split(r'\W+', syns_raw)
+split_pattern = r',\s*'
+if lang in ['zh','ar','kr','ja','tw']:
+    split_pattern = r'\W+'
+syn_list = re.split(split_pattern, syns_raw)
 category['Synonyms'] = [s.lower() for s in syn_list]
 schedule.append(category)