Updated script for better splitting of synonyms
This commit is contained in:
+340
-732
File diff suppressed because it is too large
Load Diff
+270
-595
File diff suppressed because it is too large
Load Diff
+211
-423
File diff suppressed because it is too large
Load Diff
+941
-2488
File diff suppressed because it is too large
Load Diff
+498
-1057
File diff suppressed because it is too large
Load Diff
+1117
-2972
File diff suppressed because it is too large
Load Diff
+942
-2494
File diff suppressed because it is too large
Load Diff
+418
-882
File diff suppressed because it is too large
Load Diff
+276
-594
File diff suppressed because it is too large
Load Diff
+942
-2080
File diff suppressed because it is too large
Load Diff
+1017
-2731
File diff suppressed because it is too large
Load Diff
+1023
-2759
File diff suppressed because it is too large
Load Diff
+946
-2050
File diff suppressed because it is too large
Load Diff
+423
-926
File diff suppressed because it is too large
Load Diff
+1127
-2359
File diff suppressed because it is too large
Load Diff
+896
-1940
File diff suppressed because it is too large
Load Diff
@@ -80,7 +80,10 @@ for lang in langs:
 syns_raw = category['Synonyms']
 assert type(syns_raw) == str, \
     f"Synonym list (lang: {lang}, {category['CatID']}) was not readable"
-syn_list = re.split(r'\W+', syns_raw)
+split_pattern = r',\s*'
+if lang in ['zh','ar','kr','ja','tw']:
+    split_pattern = r'\W+'
+syn_list = re.split(split_pattern, syns_raw)
 category['Synonyms'] = [s.lower() for s in syn_list]
 
 schedule.append(category)
Reference in New Issue
Block a user