33# Background tutorial on async programming with Python
44# https://realpython.com/async-io-python/
55
6- # Requires Python 3.7 or newer. Tested with 3.8 and 3.9.
6+ # Requires Python 3.7 or newer. Tested with 3.8, 3.9, 3.10 and 3.11.
77
88# Installation:
99# pip3 install --upgrade opencage asyncio aiohttp backoff tqdm
1010
11+ # Example (forward) input file:
12+ # 1,"Via Allende 8, Cascina, Toscana, Italia"
13+ # 2,"Via Coppi, 17, Formigine, Emilia-Romagna, Italia"
14+ # 3,"Via Dei Salici 20, Gallarate, Lombardia, Italia"
15+ # 4,"Via Vittorio Veneto N7, San Giuliano Terme, Toscana, Italia"
16+ # 5,"Via Tiro A Segno 8, Gallarate, Lombardia, Italia"
17+
18+ # Example (reverse) input file:
19+ # 1,"43.6783472,10.5533173"
20+ # 2,"44.5655041,10.8412106"
21+ # 3,"45.6823942,8.7919808"
22+ # 4,"43.7804922,10.402925"
23+ # 5,"45.6506236,8.8037173"
24+ # or
25+ # 1,43.6783472,10.5533173
26+ # 2,44.5655041,10.8412106
27+ # 3,45.6823942,8.7919808
28+ # 4,43.7804922,10.402925
29+ # 5,45.6506236,8.8037173
30+
31+ import os
1132import sys
1233import csv
34+ import re
1335import asyncio
1436import traceback
1537import backoff
1638from tqdm import tqdm
1739from opencage .geocoder import OpenCageGeocode , AioHttpError
1840
41+
42+
43+
44+
45+
# Script configuration. Edit these values before running.
API_KEY = ''
FILENAME_INPUT_CSV = 'file_to_geocode.csv'
FILENAME_OUTPUT_CSV = 'file_geocoded.csv'
# 'forward' (address -> coordinates), 'reverse' (coordinates -> address)
# or 'guess'. With 'guess' the script checks if the address is two numbers
# and then assumes reverse.
FORWARD_OR_REVERSE = 'guess'

MAX_ITEMS = 100  # How many lines to read from the input file. Set to 0 for unlimited
NUM_WORKERS = 3  # For 10 requests per second try 2-5
REQUEST_TIMEOUT_SECONDS = 5  # For individual HTTP requests. Default is 1
RETRY_MAX_TRIES = 10  # How often to retry if a HTTP request times out
RETRY_MAX_TIME = 60  # Limit in seconds for retries
SHOW_PROGRESS = True  # Show progress bar
2959
60+
61+
62+
63+
64+
65+
66+
67+
# Refuse to overwrite the results of an earlier run: abort with exit code 1
# when the output file is already present.
if os.path.exists(FILENAME_OUTPUT_CSV):
    sys.stderr.write(f"The output file '{FILENAME_OUTPUT_CSV}' already exists.\n")
    sys.exit(1)

# Module-level CSV writer used to emit one row per geocoded input.
# NOTE(review): the underlying file object is never closed in this block, so
# flushing relies on interpreter shutdown - confirm that is acceptable.
csv_writer = csv.writer(open(FILENAME_OUTPUT_CSV, 'w', encoding='utf8', newline=''))

# Holds the tqdm progress bar, or False when SHOW_PROGRESS is disabled.
PROGRESS_BAR = SHOW_PROGRESS and tqdm(total=0, position=0, desc="Addresses geocoded", dynamic_ncols=True)
3375
34- async def write_one_geocoding_result (geocoding_results , address , address_id ):
35- if geocoding_results is not None and len (geocoding_results ):
36- first_result = geocoding_results [0 ]
# Two comma-separated decimal numbers, each with an optional leading minus
# sign and an optional fractional part. No whitespace is allowed anywhere.
_COORDINATE_PAIR_PATTERN = re.compile(r'^(-?\d+(\.\d+)?),(-?\d+(\.\d+)?)$')


def guess_text_is_coordinate_pair(text):
    """Return True when *text* looks like a 'number,number' coordinate pair.

    Examples:
        '40.78,-73.97'      => True
        '3rd Ave, New York' => False

    Only the textual shape is checked; the numbers are not validated
    against latitude/longitude ranges.
    """
    return bool(_COORDINATE_PAIR_PATTERN.search(text))
83+
84+ async def write_one_geocoding_result (geocoding_result , address , address_id ):
85+ # print(geocoding_result, file=sys.stderr)
86+ if geocoding_result is not None :
3787 row = [
3888 address_id ,
39- first_result ['geometry' ]['lat' ],
40- first_result ['geometry' ]['lng' ],
89+ geocoding_result ['geometry' ]['lat' ],
90+ geocoding_result ['geometry' ]['lng' ],
4191 # Any of the components might be empty:
42- first_result ['components' ].get ('_type' , '' ),
43- first_result ['components' ].get ('country' , '' ),
44- first_result ['components' ].get ('county' , '' ),
45- first_result ['components' ].get ('city' , '' ),
46- first_result ['components' ].get ('postcode' , '' ),
47- first_result ['components' ].get ('road' , '' ),
48- first_result ['components' ].get ('house_number' , '' ),
49- first_result ['confidence' ],
50- first_result ['formatted' ]
92+ geocoding_result ['components' ].get ('_type' , '' ),
93+ geocoding_result ['components' ].get ('country' , '' ),
94+ geocoding_result ['components' ].get ('county' , '' ),
95+ geocoding_result ['components' ].get ('city' , '' ),
96+ geocoding_result ['components' ].get ('postcode' , '' ),
97+ geocoding_result ['components' ].get ('road' , '' ),
98+ geocoding_result ['components' ].get ('house_number' , '' ),
99+ geocoding_result ['confidence' ],
100+ geocoding_result ['formatted' ]
51101 ]
52102
53103 else :
104+ sys .stderr .write (f"not found, writing empty result: { address } \n " )
54105 row = [
55106 address_id ,
56107 0 , # not to be confused with https://en.wikipedia.org/wiki/Null_Island
@@ -65,7 +116,6 @@ async def write_one_geocoding_result(geocoding_results, address, address_id):
65116 - 1 , # confidence values are 1-10 (lowest to highest), use -1 for unknown
66117 ''
67118 ]
68- sys .stderr .write (f"not found, writing empty result: { address } \n " )
69119 csv_writer .writerow (row )
70120
71121
@@ -84,24 +134,32 @@ def backoff_hdlr(details):
84134 on_backoff = backoff_hdlr )
async def geocode_one_address(address, address_id):
    """Geocode a single input value and write its result row.

    Depending on FORWARD_OR_REVERSE the value is sent as a forward request
    (address -> coordinates) or a reverse request (coordinates -> address).
    With 'guess', the value is treated as reverse when
    guess_text_is_coordinate_pair() accepts it.

    Exceptions from the request or from writing the row are printed to
    stderr and not re-raised. A failed or empty lookup still produces a
    row: write_one_geocoding_result() is called with None in that case.

    Args:
        address: Address text, or a 'lat,lng' string for reverse geocoding.
        address_id: Identifier from the input file, passed through to the
            output row.
    """
    # Bind the name before the request. If the request raises, the code
    # below must still see a value; otherwise it fails on an unbound local
    # and no row is written for this address.
    geocoding_results = None

    async with OpenCageGeocode(API_KEY) as geocoder:
        try:
            is_reverse = FORWARD_OR_REVERSE == 'reverse' or (
                FORWARD_OR_REVERSE == 'guess' and guess_text_is_coordinate_pair(address))

            if is_reverse:
                # Reverse:
                # coordinates -> address, e.g. '40.78,-73.97' => '101, West 91st Street, New York'
                coordinate_parts = address.split(',')
                geocoding_results = await geocoder.reverse_geocode_async(
                    coordinate_parts[0], coordinate_parts[1], no_annotations=1)
            else:
                # Forward:
                # address -> coordinates
                # note: you may also want to set other optional parameters like
                # countrycode, language, etc
                # see the full list: https://opencagedata.com/api#forward-opt
                geocoding_results = await geocoder.geocode_async(address, no_annotations=1)
        except Exception as exc:
            # The three-argument form works on every Python version the
            # script supports; print_exception(exc) alone needs 3.10+.
            traceback.print_exception(type(exc), exc, exc.__traceback__, file=sys.stderr)

        try:
            # Only the first result is written; None marks "not found".
            if geocoding_results is not None and len(geocoding_results):
                geocoding_result = geocoding_results[0]
            else:
                geocoding_result = None

            await write_one_geocoding_result(geocoding_result, address, address_id)
        except Exception as exc:
            traceback.print_exception(type(exc), exc, exc.__traceback__, file=sys.stderr)
107165
@@ -127,7 +185,7 @@ async def run_worker(worker_name, queue):
127185
128186async def main ():
129187 global PROGRESS_BAR
130- assert sys .version_info >= (3 , 7 ), "Script requires Python 3.7+. "
188+ assert sys .version_info >= (3 , 7 ), "Script requires Python 3.7 or newer "
131189
132190 ## 1. Read CSV into a Queue
133191 ## Each work_item is an address and id. The id will be part of the output,
@@ -138,16 +196,23 @@ async def main():
138196 ##
139197 queue = asyncio .Queue (maxsize = MAX_ITEMS )
140198
141- csv_reader = csv .reader (open (FILENAME_INPUT_CSV , 'r' ), strict = True , skipinitialspace = True )
199+ with open (FILENAME_INPUT_CSV , 'r' , encoding = 'utf-8' ) as infile :
200+ csv_reader = csv .reader (infile , strict = True , skipinitialspace = True )
201+
202+ for row in csv_reader :
203+ if len (row ) == 0 :
204+ raise Exception (f"Empty line in input file at line number { csv_reader .line_num } , aborting" )
142205
143- for row in csv_reader :
144- if len (row ) == 0 :
145- raise Exception (f"Empty line in input file at line number { csv_reader .line_num } , aborting" )
206+ if FORWARD_OR_REVERSE == 'reverse' or \
207+ (FORWARD_OR_REVERSE == 'guess' and len (row ) > 2 and \
208+ guess_text_is_coordinate_pair (f"{ row [1 ]} ,{ row [2 ]} " )):
209+ work_item = {'id' : row [0 ], 'address' : f"{ row [1 ]} ,{ row [2 ]} " }
210+ else :
211+ work_item = {'id' : row [0 ], 'address' : row [1 ]}
146212
147- work_item = {'id' : row [0 ], 'address' : row [1 ]}
148- await queue .put (work_item )
149- if queue .full ():
150- break
213+ await queue .put (work_item )
214+ if queue .full ():
215+ break
151216
152217 sys .stderr .write (f"{ queue .qsize ()} work_items in queue\n " )
153218
0 commit comments