|
1 | | -"""Download kenlm models for supported languages (48) from Facebook. |
| 1 | +"""Download SentencePiece and KenLM models for supported languages (48) from Facebook. |
2 | 2 |
|
3 | 3 | Usage: |
4 | | - python download_kenlm_models.py --output_path /tmp/ |
| 4 | + python download_sentencepiece_kenlm_models.py --output_path /tmp/ |
5 | 5 |
|
6 | | -All kenlm language models will be saved under /tmp. |
| 6 | +All SentencePiece tokenizer models and KenLM language models will be saved under /tmp. |
7 | 7 | """ |
8 | 8 |
|
9 | 9 | import argparse |
|
12 | 12 | from languages_id import langs_id |
13 | 13 |
|
14 | 14 |
|
15 | | -def download_kenlm_models(output_path: str) -> None: |
| 15 | +def download_sentencepiece_kenlm_models(output_path: str) -> None: |
| 16 | + supported_sentencepiece_langs = langs_id["sentencepiece_id"].dropna().unique() |
| 17 | + for lang in supported_sentencepiece_langs: |
| 18 | + try: |
| 19 | + output_sentencepiece = subprocess.check_output( |
| 20 | + f"wget http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.sp.model -P {output_path}", |
| 21 | + shell=True, |
| 22 | + ) |
| 23 | + except: |
| 24 | + print(f"Warning: Download failed for Sentencepiece model for language {lang}.") |
| 25 | + |
16 | 26 | supported_kenlm_langs = langs_id["kenlm_id"].dropna().unique() |
17 | | - # unsupported_kenlm_langs = langs_id.loc[~langs_id.oscar_id.isin(langs_id.kenlm_id)].oscar_id.dropna().unique() |
18 | 27 | for lang in supported_kenlm_langs: |
19 | 28 | try: |
20 | 29 | output_kenlm = subprocess.check_output( |
21 | 30 | f"wget http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.arpa.bin -P {output_path}", |
22 | 31 | shell=True, |
23 | 32 | ) |
24 | | - output_sentencepiece = subprocess.check_output( |
25 | | - f"wget http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.sp.model -P {output_path}", |
26 | | - shell=True, |
27 | | - ) |
28 | 33 | except: |
29 | 34 | print(f"Warning: Download failed for KenLM model for language {lang}.") |
30 | 35 |
|
31 | 36 |
|
32 | 37 | if __name__ == "__main__": |
33 | 38 | parser = argparse.ArgumentParser( |
34 | | - description="Download kenlm models for supported languages." |
| 39 | + description="Download Sentencepiece and KenLM models for supported languages." |
35 | 40 | ) |
36 | 41 | parser.add_argument( |
37 | 42 | "--output_path", type=str, default="/tmp/", help="Output path to save models." |
38 | 43 | ) |
39 | 44 | args = parser.parse_args() |
40 | 45 |
|
41 | | - download_kenlm_models(output_path=args.output_path) |
| 46 | + download_sentencepiece_kenlm_models(output_path=args.output_path) |
0 commit comments