Skip to content

Commit 7cecdc6

Browse files
committed
modify name of download_kenlm_models to include sentencepiece
1 parent 27f489d commit 7cecdc6

3 files changed

Lines changed: 17 additions & 15 deletions

File tree

ac_dc/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ Less importantly, you can also check how the anonymization and the normalization
2121

2222
#### 2. Download everything you need
2323

24-
To run the filtering code, it is necessary to download the dataset on which the filtering will take place, but also the necessary models, which are the Fasttext model for language identification (download [here](https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin)) and the Sentencepiece and KenLM models for tokenization and calculation of perplexity scores (download with the file [download_kenlm_models.py](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/download_kenlm_models.py)).
24+
To run the filtering code, you need to download the dataset to be filtered as well as the required models: the Fasttext model for language identification (download [here](https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin)) and the Sentencepiece and KenLM models used for tokenization and for computing perplexity scores (download them with the script [download_sentencepiece_kenlm_models.py](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/download_sentencepiece_kenlm_models.py)).
2525

2626
#### 3. Choose the filtering parameters
2727

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
"""Download kenlm models for supported languages (48) from Facebook.
1+
"""Download Sentencepiece and KenLM models for supported languages (48) from Facebook.
22
33
Usage:
4-
python download_kenlm_models.py --output_path /tmp/
4+
python download_sentencepiece_kenlm_models.py --output_path /tmp/
55
6-
All kenlm language models will be saved under /tmp.
6+
All Sentencepiece and KenLM language models will be saved under /tmp.
77
"""
88

99
import argparse
@@ -12,30 +12,35 @@
1212
from languages_id import langs_id
1313

1414

15-
def download_kenlm_models(output_path: str) -> None:
15+
def download_sentencepiece_kenlm_models(output_path: str) -> None:
16+
supported_sentencepiece_langs = langs_id["sentencepiece_id"].dropna().unique()
17+
for lang in supported_sentencepiece_langs:
18+
try:
19+
output_sentencepiece = subprocess.check_output(
20+
f"wget http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.sp.model -P {output_path}",
21+
shell=True,
22+
)
23+
except:
24+
print(f"Warning: Download failed for Sentencepiece model for language {lang}.")
25+
1626
supported_kenlm_langs = langs_id["kenlm_id"].dropna().unique()
17-
# unsupported_kenlm_langs = langs_id.loc[~langs_id.oscar_id.isin(langs_id.kenlm_id)].oscar_id.dropna().unique()
1827
for lang in supported_kenlm_langs:
1928
try:
2029
output_kenlm = subprocess.check_output(
2130
f"wget http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.arpa.bin -P {output_path}",
2231
shell=True,
2332
)
24-
output_sentencepiece = subprocess.check_output(
25-
f"wget http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.sp.model -P {output_path}",
26-
shell=True,
27-
)
2833
except:
2934
print(f"Warning: Download failed for KenLM model for language {lang}.")
3035

3136

3237
if __name__ == "__main__":
3338
parser = argparse.ArgumentParser(
34-
description="Download kenlm models for supported languages."
39+
description="Download Sentencepiece and KenLM models for supported languages."
3540
)
3641
parser.add_argument(
3742
"--output_path", type=str, default="/tmp/", help="Output path to save models."
3843
)
3944
args = parser.parse_args()
4045

41-
download_kenlm_models(output_path=args.output_path)
46+
download_sentencepiece_kenlm_models(output_path=args.output_path)

ac_dc/filtering.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,6 @@
22

33
import fasttext
44

5-
# To download the fasttext model:
6-
# wget -O /tmp/lid.176.bin https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
7-
85
import sentencepiece
96
import kenlm
107

0 commit comments

Comments
 (0)