bigscience-workshop
diff --git a/‎cc_pseudo_crawl/cc_lookup.py‎
Lines changed: 107 additions & 46 deletions b/‎cc_pseudo_crawl/cc_lookup.py‎
Lines changed: 107 additions & 46 deletions
diff --git a/‎cc_pseudo_crawl/requirements.txt‎
Lines changed: 1 addition & 1 deletion b/‎cc_pseudo_crawl/requirements.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cc_pseudo_crawl/sourcing_sheet_seeds/README.md‎
Lines changed: 1 addition & 1 deletion b/‎cc_pseudo_crawl/sourcing_sheet_seeds/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cc_pseudo_crawl/sourcing_sheet_seeds/candidate_websites_for_crawling.csv‎
Lines changed: 3 additions & 3 deletions b/‎cc_pseudo_crawl/sourcing_sheet_seeds/candidate_websites_for_crawling.csv‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎pii-manager/CHANGES.md‎
Lines changed: 14 additions & 0 deletions b/‎pii-manager/CHANGES.md‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎pii-manager/Makefile‎
Lines changed: 3 additions & 3 deletions b/‎pii-manager/Makefile‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎pii-manager/doc/contributing.md‎
Lines changed: 94 additions & 7 deletions b/‎pii-manager/doc/contributing.md‎
Lines changed: 94 additions & 7 deletions
@@ -9,8 +9,9 @@
 
 from pyathena import connect
 
-logging.basicConfig(level='INFO',
-                    format='%(asctime)s %(levelname)s %(name)s: %(message)s')
+logging.basicConfig(
+    level="INFO", format="%(asctime)s %(levelname)s %(name)s: %(message)s"
+)
 
 join_template = """
 CREATE TABLE {db}._tmp_overlap
@@ -45,7 +46,7 @@
 WHERE cc.crawl = '{crawl}'
 """
 
-drop_tmp_table = 'DROP TABLE `{db}._tmp_overlap`;'
+drop_tmp_table = "DROP TABLE `{db}._tmp_overlap`;"
 
 # list of crawls
 # Note: in order to get a list of released crawls:
@@ -54,49 +55,103 @@
 # - see
 #    https://commoncrawl.s3.amazonaws.com/crawl-data/index.html
 crawls = [
-    'CC-MAIN-2013-20', 'CC-MAIN-2013-48',
+    "CC-MAIN-2013-20",
+    "CC-MAIN-2013-48",
     #
-    'CC-MAIN-2014-10', 'CC-MAIN-2014-15', 'CC-MAIN-2014-23',
-    'CC-MAIN-2014-35', 'CC-MAIN-2014-41', 'CC-MAIN-2014-42',
-    'CC-MAIN-2014-49', 'CC-MAIN-2014-52',
+    "CC-MAIN-2014-10",
+    "CC-MAIN-2014-15",
+    "CC-MAIN-2014-23",
+    "CC-MAIN-2014-35",
+    "CC-MAIN-2014-41",
+    "CC-MAIN-2014-42",
+    "CC-MAIN-2014-49",
+    "CC-MAIN-2014-52",
     #
-    'CC-MAIN-2015-06', 'CC-MAIN-2015-11', 'CC-MAIN-2015-14',
-    'CC-MAIN-2015-18', 'CC-MAIN-2015-22', 'CC-MAIN-2015-27',
-    'CC-MAIN-2015-32', 'CC-MAIN-2015-35', 'CC-MAIN-2015-40',
-    'CC-MAIN-2015-48',
+    "CC-MAIN-2015-06",
+    "CC-MAIN-2015-11",
+    "CC-MAIN-2015-14",
+    "CC-MAIN-2015-18",
+    "CC-MAIN-2015-22",
+    "CC-MAIN-2015-27",
+    "CC-MAIN-2015-32",
+    "CC-MAIN-2015-35",
+    "CC-MAIN-2015-40",
+    "CC-MAIN-2015-48",
     #
-    'CC-MAIN-2016-07', 'CC-MAIN-2016-18', 'CC-MAIN-2016-22',
-    'CC-MAIN-2016-26', 'CC-MAIN-2016-30', 'CC-MAIN-2016-36',
-    'CC-MAIN-2016-40', 'CC-MAIN-2016-44', 'CC-MAIN-2016-50',
+    "CC-MAIN-2016-07",
+    "CC-MAIN-2016-18",
+    "CC-MAIN-2016-22",
+    "CC-MAIN-2016-26",
+    "CC-MAIN-2016-30",
+    "CC-MAIN-2016-36",
+    "CC-MAIN-2016-40",
+    "CC-MAIN-2016-44",
+    "CC-MAIN-2016-50",
     #
-    'CC-MAIN-2017-04', 'CC-MAIN-2017-09', 'CC-MAIN-2017-13',
-    'CC-MAIN-2017-17', 'CC-MAIN-2017-22', 'CC-MAIN-2017-26',
-    'CC-MAIN-2017-30', 'CC-MAIN-2017-34', 'CC-MAIN-2017-39',
-    'CC-MAIN-2017-43', 'CC-MAIN-2017-47', 'CC-MAIN-2017-51',
+    "CC-MAIN-2017-04",
+    "CC-MAIN-2017-09",
+    "CC-MAIN-2017-13",
+    "CC-MAIN-2017-17",
+    "CC-MAIN-2017-22",
+    "CC-MAIN-2017-26",
+    "CC-MAIN-2017-30",
+    "CC-MAIN-2017-34",
+    "CC-MAIN-2017-39",
+    "CC-MAIN-2017-43",
+    "CC-MAIN-2017-47",
+    "CC-MAIN-2017-51",
     #
-    'CC-MAIN-2018-05', 'CC-MAIN-2018-09', 'CC-MAIN-2018-13',
-    'CC-MAIN-2018-17', 'CC-MAIN-2018-22', 'CC-MAIN-2018-26',
-    'CC-MAIN-2018-30', 'CC-MAIN-2018-34', 'CC-MAIN-2018-39',
-    'CC-MAIN-2018-43', 'CC-MAIN-2018-47', 'CC-MAIN-2018-51',
+    "CC-MAIN-2018-05",
+    "CC-MAIN-2018-09",
+    "CC-MAIN-2018-13",
+    "CC-MAIN-2018-17",
+    "CC-MAIN-2018-22",
+    "CC-MAIN-2018-26",
+    "CC-MAIN-2018-30",
+    "CC-MAIN-2018-34",
+    "CC-MAIN-2018-39",
+    "CC-MAIN-2018-43",
+    "CC-MAIN-2018-47",
+    "CC-MAIN-2018-51",
     #
-    'CC-MAIN-2019-04', 'CC-MAIN-2019-09', 'CC-MAIN-2019-13',
-    'CC-MAIN-2019-18', 'CC-MAIN-2019-22', 'CC-MAIN-2019-26',
-    'CC-MAIN-2019-30', 'CC-MAIN-2019-35', 'CC-MAIN-2019-39',
-    'CC-MAIN-2019-43', 'CC-MAIN-2019-47', 'CC-MAIN-2019-51',
+    "CC-MAIN-2019-04",
+    "CC-MAIN-2019-09",
+    "CC-MAIN-2019-13",
+    "CC-MAIN-2019-18",
+    "CC-MAIN-2019-22",
+    "CC-MAIN-2019-26",
+    "CC-MAIN-2019-30",
+    "CC-MAIN-2019-35",
+    "CC-MAIN-2019-39",
+    "CC-MAIN-2019-43",
+    "CC-MAIN-2019-47",
+    "CC-MAIN-2019-51",
     #
-    'CC-MAIN-2020-05', 'CC-MAIN-2020-10', 'CC-MAIN-2020-16',
-    'CC-MAIN-2020-24', 'CC-MAIN-2020-29', 'CC-MAIN-2020-34',
-    'CC-MAIN-2020-40', 'CC-MAIN-2020-45', 'CC-MAIN-2020-50',
+    "CC-MAIN-2020-05",
+    "CC-MAIN-2020-10",
+    "CC-MAIN-2020-16",
+    "CC-MAIN-2020-24",
+    "CC-MAIN-2020-29",
+    "CC-MAIN-2020-34",
+    "CC-MAIN-2020-40",
+    "CC-MAIN-2020-45",
+    "CC-MAIN-2020-50",
     #
-    'CC-MAIN-2021-04', 'CC-MAIN-2021-10', 'CC-MAIN-2021-17',
-    'CC-MAIN-2021-21', 'CC-MAIN-2021-25', 'CC-MAIN-2021-31',
-    'CC-MAIN-2021-39', 'CC-MAIN-2021-43', 'CC-MAIN-2021-49',
+    "CC-MAIN-2021-04",
+    "CC-MAIN-2021-10",
+    "CC-MAIN-2021-17",
+    "CC-MAIN-2021-21",
+    "CC-MAIN-2021-25",
+    "CC-MAIN-2021-31",
+    "CC-MAIN-2021-39",
+    "CC-MAIN-2021-43",
+    "CC-MAIN-2021-49",
     #
 ]
 
 
 s3_location = sys.argv[1]
-s3_location = s3_location.rstrip('/') # no trailing slash!
+s3_location = s3_location.rstrip("/")  # no trailing slash!
 
 seed_table = sys.argv[2]
 
@@ -106,23 +161,29 @@
 crawls = filter(lambda c: crawl_selector.match(c), crawls)
 
 
-cursor = connect(s3_staging_dir="{}/staging".format(s3_location),
-                 region_name="us-east-1").cursor()
+cursor = connect(
+    s3_staging_dir="{}/staging".format(s3_location), region_name="us-east-1"
+).cursor()
 
 for crawl in crawls:
-    query = join_template.format(crawl=crawl,
-                                 s3_location="{}/cc".format(s3_location),
-                                 db='bigscience',
-                                 seed_table=seed_table,
-                                 tid='bs')
+    query = join_template.format(
+        crawl=crawl,
+        s3_location="{}/cc".format(s3_location),
+        db="bigscience",
+        seed_table=seed_table,
+        tid="bs",
+    )
     logging.info("Athena query: %s", query)
 
     cursor.execute(query)
     logging.info("Athena query ID %s: %s", cursor.query_id, cursor.result_set.state)
-    logging.info("       data_scanned_in_bytes: %d", cursor.result_set.data_scanned_in_bytes)
-    logging.info("       total_execution_time_in_millis: %d", cursor.result_set.total_execution_time_in_millis)
-
-    cursor.execute(drop_tmp_table.format(db='bigscience'))
+    logging.info(
+        "       data_scanned_in_bytes: %d", cursor.result_set.data_scanned_in_bytes
+    )
+    logging.info(
+        "       total_execution_time_in_millis: %d",
+        cursor.result_set.total_execution_time_in_millis,
+    )
+
+    cursor.execute(drop_tmp_table.format(db="bigscience"))
     logging.info("Drop temporary table: %s", cursor.result_set.state)
-
-
 
@@ -1,3 +1,3 @@
 pyathena
 surt
-tldextract
+tldextract
@@ -8,4 +8,4 @@ Steps:
 
 2. do the lookups / table join, see [general instructions](../README.md) using the crawl selector `CC-MAIN-202[01]` to restrict the join for the last 2 years
 
-3. prepare [coverage metrics](./cc-metrics.ipynb)
+3. prepare [coverage metrics](./cc-metrics.ipynb)
@@ -30,9 +30,9 @@
 38,energia-imdea,https://www.energia.imdea.org/,unknown,"multiple releases",,es,spain,"general news","text (web)",manual,,unknown,unknown,energia-imdea,
 39,urgente24,https://www.urgente24.com/,unknown,"multiple releases",,es,argentina,"general news","text (web)",manual,,unknown,unknown,urgente24,
 40,"corte electoral",https://www.corteelectoral.gub.uy/,unknown,"multiple releases",,es,uruguay,"general news","text (web)",manual,,unknown,unknown,"corte electoral",
-41,"observatorio de la política china",https://politica-china.org,unknown,"multiple releases",,es,spain,"general news","text (web)",manual,,unknown,unknown,"1, According to the “Provisions of the Supreme People's Court on the People's Court's Publication of Judgment Documents online” (最高人民法院关于人民法院在互联网公布裁判文书的规定), the online publication of judgment documents should be based on the principle of openness, with non-publicity as an exception. Judicial documents involving national security, juvenile delinquency, divorce proceedings, support or guardianship of minor children, etc., shall not be made public. In 
-public judgment documents, information concerning personal privacy, 
-trade secrets, etc., other than the names of the parties, shall also be 
+41,"observatorio de la política china",https://politica-china.org,unknown,"multiple releases",,es,spain,"general news","text (web)",manual,,unknown,unknown,"1, According to the “Provisions of the Supreme People's Court on the People's Court's Publication of Judgment Documents online” (最高人民法院关于人民法院在互联网公布裁判文书的规定), the online publication of judgment documents should be based on the principle of openness, with non-publicity as an exception. Judicial documents involving national security, juvenile delinquency, divorce proceedings, support or guardianship of minor children, etc., shall not be made public. In
+public judgment documents, information concerning personal privacy,
+trade secrets, etc., other than the names of the parties, shall also be
 deleted from the document.",
 42,"icefi - instituto centroamericano de estudios fiscales",http://icefi.org/,unknown,"multiple releases",,es,gt,"general news","text (web)",manual,,unknown,unknown,"icefi - instituto centroamericano de estudios fiscales",
 43,"baja california sur - gobierno del estado",http://www.bcs.gob.mx/,unknown,"multiple releases",,es,mexico,"general news","text (web)",manual,,unknown,unknown,"baja california sur - gobierno del estado",
 
@@ -1,3 +1,17 @@
+v. 0.5.0
+ * new task list parsing code, adding a "full" format based on dicts, in
+   addition to the previous "simplified" format based on tuples
+ * refactored to allow more than one task for a given PII and country
+ * added the capability to add task descriptors programmatically
+ * added reading task descriptors from a JSON file
+ * context validation spec, for all three task implementation types
+ * TASK_ANY split into LANG_ANY & COUNTRY_ANY
+ * PII detectors for international phone numbers, for en-any & es-any
+ * PII detector for IP addresses, language independent
+ * PII detectors for GOV_ID
+    - lang pt, countries PT & BR
+    - lang es, country MX
+
 v. 0.4.0
  * PII GOV_ID task for es-ES and en-AU
  * PII EMAIL_ADDRESS task
 
@@ -78,11 +78,11 @@ $(VENV)/bin/pytest:
 
 # -----------------------------------------------------------------------
 
-upload-check:
+upload-check: $(PKGFILE)
 	twine check $(PKGFILE)
 
-upload-test:
+upload-test: $(PKGFILE)
 	twine upload --repository pypitest $(PKGFILE)
 
-upload:
+upload: $(PKGFILE)
 	twine upload $(PKGFILE)
@@ -5,7 +5,7 @@ repository with the following changes:
 
  1. If the task type is a new one, add an identifier for it in [PiiEnum]
  2. If it is for a language not yet covered, add a new language subfolder
-    undder the [lang] folder, using the [ISO 639-1] code for the language
+    under the [lang] folder, using the [ISO 639-1] code for the language
  3. Then
     * If it is a country-independent PII, it goes into the `any` subdir
       (create that directory if it is not present)
@@ -15,8 +15,8 @@ repository with the following changes:
     module (the name of the file is not relevant). The module must contain:
     * The task implementation, which can have any of three flavours (regex,
       function or class), see below
-    * The task descriptor, a list with the (compulsory) name `PII_TASKS` (see
-      below)
+    * The task descriptor, a list containing all defined tasks. The list
+      variable *must* be named `PII_TASKS` (see below)
  5. Finally, add a unit test to check the validity for the task code, in the
     proper place under [test/unit/lang]. There should be at least
      - a positive test: one valid PII that has to be detected. For the cases
@@ -35,25 +35,112 @@ including where to add the required documentation explaining the task.
 
 ## Task descriptor
 
-The task descriptor is a Python list that contains at least one tuple defining
-the entry points for this task (there might be more than one, if the file
-implements more than one PII).
+The task descriptor is a Python list that contains at least one element
+defining the entry points for this task (there might be more than one, if
+the file implements more than one PII).
 
 * The name of the list **must be** `PII_TASKS`
+* A task entry in the list can have two different shapes: simplified and full.
+  In a `PII_TASKS` list they can be combined freely.
 
-* Each defined task must be a 2- or 3-element tuple, with these elements:
+
+### Simplified description
+
+In a simplified description a task must be a 2- or 3-element tuple, with
+these elements:
    - the PII identifier for the task: a member of [PiiEnum]
    - the [task implementation]: the regex, function or class implementing the
      PII extractor
    - (only if the implementation is of regex type) a text description of the
      task (for documentation purposes)
 
 
+### Full description
+
+In a full description a task is a dictionary with these compulsory fields:
+ * `pii`: the PII identifier for the task: a member of [PiiEnum]
+ * `type`: the task type: `regex`, `callable` or `PiiTask`
+ * `task`: for regex tasks, a raw string (containing the regex to be used);
+   for function tasks a callable and for PiiTask either a class or a string
+   with a full class name.
+
+And these optional fields:
+ * `lang`: language this task is designed for (it can also be `LANG_ANY`). If
+   not present, the language will be determined from the folder structure the
+   task implementation is in
+ * `country`: country this task is designed for. If not present, the country
+   will be determined from the folder structure the task implementation is in,
+   if possible (else, a `None` value will be used, meaning the task is not
+   country-dependent)
+ * `name`: a name for the task. If not present, a name will be generated from
+   the `pii_name` class-level attribute (PiiTask) or from the class/function
+   name.
+   This is meant to provide a higher level of detail than the `PiiEnum`
+   generic name (e.g. for different types of Government ID). Class-type tasks
+   can use a dynamic name at runtime (detected PII might have different names),
+   while function and regexes will have a fixed name.
+ * `doc`: the documentation for the task. If not present, the docstring for
+   callable and class types will be used (for regex types, the task will have
+   no documentation)
+ * `kwargs`: a dictionary of additional arguments. For `PiiTask` task types,
+   they will be added as arguments to the class constructor; for `callable`
+   types they will be added to each call to the task function. It is ignored
+   for `regex` types.
+ * `context` and `context_width`: for context validation, see below.
+
+
+## Context validation
+
+A task of any of the three types of [task implementation] may also include an
+additional step for context validation. In this step, all detected PII are
+further validated by ensuring that a document chunk around the detected PII
+string (before and after) contains one of the specified text contexts.
+
+Note that task descriptors with context *must* be a full description, i.e. the
+dict version.
+
+Context validation can have three variants:
+ * `string`: each text context is a substring to be matched.
+ * `word`: each text context is also a substring to be matched, but matching
+   is ensured to work only on full words (the substring can contain more than
+   one word, but it will be matched only on sequences of full words).
+ * `regex`: each context is a regular expression (to be matched by the [regex]
+   Python package)
+
+Regardless of the variant, matching is always performed after normalizing
+the extracted document context chunk: normalize whitespace (replace all
+whitespace chunks by a single space) and lowercase the chunk.
+
+This validation acts as a filter after the task implementation produces its
+results, and it is automatically applied if the task descriptor includes
+context (for class-based tasks, it can be replaced with custom code by
+overriding the `__call__` method, defined in the parent class).
+
+Context is defined by a `context` field in the task descriptor. This field
+is a dictionary, with the following elements:
+ * `type` indicates the variant, and it can be `string`, `word` or `regex`.
+   If not present, `string` is assumed.
+ * `value` should be a list of strings, any of which validates a document
+   context. For `regex` mode the strings will be regex patterns.
+ * `width` is an integer, or a tuple of two integers, that defines
+   the width (in characters) of the document chunk (before and after the PII
+   string) that defines the document context. If not specified, a default is
+   used.
+
+As a shortcut, the `context` field can also contain a simple list of strings
+(or a single string). In this case, the context is defined as type `string`,
+with the default width, and the field contents define the value.
+
+An example of context validation can be seen in the [international phone
+number] task.
 
 [task implementation]: #task-implementation
 [PiiEnum]: ../src/pii_manager/piienum.py
 [tasks]: tasks.md
 [lang]: ../src/pii_manager/lang
 [test/unit/lang]: ../test/unit/lang
+[international phone number]: ../src/pii_manager/lang/en/any/international_phone_number.py
+
 [ISO 639-1]: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
 [ISO 3166-1]: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
+[regex]: https://github.com/mrabarnett/mrab-regex
Original file line number	Diff line number	Diff line change
`@@ -8,4 +8,4 @@ Steps:`
`8`	`8`
`9`	`9`	2. do the lookups / table join, see [general instructions](../README.md) using the crawl selector `CC-MAIN-202[01]` to restrict the join for the last 2 years
`10`	`10`
`11`		`-3. prepare [coverage metrics](./cc-metrics.ipynb)`
	`11`	`+3. prepare [coverage metrics](./cc-metrics.ipynb)`