Skip to content

Commit 6e0261d

Browse files
committed
added function to find the maximum number of processors to use
1 parent 4b0821a commit 6e0261d

1 file changed

Lines changed: 38 additions & 3 deletions

File tree

ac_dc/main_filtering.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,47 @@
11
"""Filtering."""
22

3+
from multiprocessing import cpu_count
4+
35
import argparse
46

57
from datasets import load_dataset
68

79
from filtering import DatasetFiltering
810

911

12+
def check_num_proc(num_proc: int = -1) -> int:
    """
    Check the number of processors. Return a safe-checked value.

    Parameters
    ----------
    num_proc : int, optional
        Number of processors to use, by default -1. The value -1 is a
        sentinel meaning "use every processor reported by ``cpu_count()``".

    Returns
    -------
    int
        Number of processors to use

    Raises
    ------
    ValueError
        If the input exceeds the number of processors available, or if it
        is zero or a negative value other than -1
    """
    maximum: int = cpu_count()

    # -1 is the "use everything" sentinel; resolve it before validating.
    if num_proc == -1:
        return maximum

    # Zero or any other negative value cannot be a worker count; reject it
    # here instead of handing a nonsensical value to the caller.
    if num_proc < 1:
        raise ValueError(
            f"{num_proc} is not a valid number of processors "
            f"(expected -1 or a value between 1 and {maximum})"
        )

    if num_proc > maximum:
        raise ValueError(
            f"{num_proc} exceeds the maximum number ({maximum}) of processors"
        )

    # Only warn when the machine is actually under-used; asking for exactly
    # `maximum` processors is equivalent to the default and needs no notice.
    if num_proc < maximum:
        print(f"Using {num_proc} processors out of {maximum} can be slow")

    return num_proc
43+
44+
1045
def parseArgs():
1146
parser = argparse.ArgumentParser(description="Filtering.")
1247
parser.add_argument(
@@ -60,8 +95,8 @@ def parseArgs():
6095
parser.add_argument(
6196
"--num_proc",
6297
type=int,
63-
default=2,
64-
help="Number of processes for multiprocessing.",
98+
default=-1,
99+
help="Number of processes for multiprocessing. Default at the number of processors available.",
65100
)
66101
parser.add_argument(
67102
"--path_dir_save_dataset",
@@ -89,7 +124,7 @@ def main():
89124
path_fasttext_model=args.path_fasttext_model,
90125
path_sentencepiece_model=args.path_sentencepiece_model,
91126
path_kenlm_model=args.path_kenlm_model,
92-
num_proc=args.num_proc,
127+
num_proc=check_num_proc(args.num_proc),
93128
path_dir_save_dataset=args.path_dir_save_dataset,
94129
)
95130
dataset_filtering.modifying_documents()

0 commit comments

Comments
 (0)