|
1 | 1 | """Filtering.""" |
2 | 2 |
|
| 3 | +from multiprocessing import cpu_count |
| 4 | + |
3 | 5 | import argparse |
4 | 6 |
|
5 | 7 | from datasets import load_dataset |
6 | 8 |
|
7 | 9 | from filtering import DatasetFiltering |
8 | 10 |
|
9 | 11 |
|
def check_num_proc(num_proc: int = -1) -> int:
    """
    Check the number of processors. Return a safe-checked value.

    Parameters
    ----------
    num_proc : int, optional
        Number of processors to use, by default -1. The sentinel -1 means
        "use every processor reported by ``cpu_count()``".

    Returns
    -------
    int
        Number of processors to use

    Raises
    ------
    ValueError
        If the input exceeds the number of processors available, or if it
        is neither -1 nor a positive integer
    """
    maximum: int = cpu_count()

    # -1 is the "use everything" sentinel; resolve it before range checks.
    if num_proc == -1:
        return maximum

    # 0 and other negative values are not usable worker counts; reject them
    # here instead of handing an invalid value to the caller.
    if num_proc < 1:
        raise ValueError(
            f"{num_proc} is not a valid number of processors "
            "(expected -1 or a positive integer)"
        )

    if num_proc > maximum:
        raise ValueError(
            f"{num_proc} exceeds the maximum number ({maximum}) of processors"
        )

    # Only warn when fewer than all available processors were requested;
    # asking for exactly `maximum` is equivalent to the default.
    if num_proc < maximum:
        print(f"Using {num_proc} processors out of {maximum} can be slow")

    return num_proc
| 43 | + |
| 44 | + |
10 | 45 | def parseArgs(): |
11 | 46 | parser = argparse.ArgumentParser(description="Filtering.") |
12 | 47 | parser.add_argument( |
@@ -60,8 +95,8 @@ def parseArgs(): |
60 | 95 | parser.add_argument( |
61 | 96 | "--num_proc", |
62 | 97 | type=int, |
63 | | - default=2, |
64 | | - help="Number of processes for multiprocessing.", |
| 98 | + default=-1, |
| 99 | + help="Number of processes for multiprocessing. Default at the number of processors available.", |
65 | 100 | ) |
66 | 101 | parser.add_argument( |
67 | 102 | "--path_dir_save_dataset", |
@@ -89,7 +124,7 @@ def main(): |
89 | 124 | path_fasttext_model=args.path_fasttext_model, |
90 | 125 | path_sentencepiece_model=args.path_sentencepiece_model, |
91 | 126 | path_kenlm_model=args.path_kenlm_model, |
92 | | - num_proc=args.num_proc, |
| 127 | + num_proc=check_num_proc(args.num_proc), |
93 | 128 | path_dir_save_dataset=args.path_dir_save_dataset, |
94 | 129 | ) |
95 | 130 | dataset_filtering.modifying_documents() |
|
0 commit comments