I tried to geoparse some phrases, but not all of the cities are matched (for example, 'Sciacca' and 'Asciano').
Note that all of the cities are present in the database and all of the phrases are correctly tokenized.
EDIT: I noticed that if I manually whitelist the cities everything works fine, but why are they not matched without being whitelisted?
import soton_corenlppy
import geoparsepy
import logging
# Send log records to the console as bare messages at INFO level and above.
logging.basicConfig(format="%(message)s", level=logging.INFO)
logger = logging.getLogger("geoparsepy")
logger.info('Logging started')

# Build the geoparse configuration for Italian and English text; the same
# logger is handed to the library so its loading messages are reported too.
geospatial_config = geoparsepy.geo_parse_lib.get_geoparse_config(
    lang_codes=['it', 'en'],
    logger=logger,
)
# Focus areas whose preprocessed location tables should be loaded.
focus_areas = ['global_cities', 'europe_places', 'north_america_places', 'uk_places']

# One entry per (focus area, geometry kind) pair, keyed "<focus_area>_<kind>".
# Every entry gets its own [-1, -1] list.
# NOTE(review): presumably [-1, -1] means "no location-id range restriction" --
# confirm against the geoparsepy cache_preprocessed_locations documentation.
location_ids = {
    f"{focus_area}_{suffix}": [-1, -1]
    for focus_area in focus_areas
    for suffix in ('admin', 'poly', 'line', 'point')
}
# Create a connection with the database
database_handler = soton_corenlppy.PostgresqlHandler.PostgresqlHandler(
    user='postgres',
    passw=' ',
    hostname='localhost',
    port=5432,
    database='openstreetmap',
)
try:
    # Load a set of previously preprocessed locations from database
    cached_locations = geoparsepy.geo_preprocess_lib.cache_preprocessed_locations(
        database_handle=database_handler,
        location_ids=location_ids,
        schema='public',
        geospatial_config=geospatial_config,
    )
    logger.info(f"Loaded {len(cached_locations)} position")
finally:
    # Close the connection even when loading fails, so it is never leaked.
    database_handler.close()
# Build an inverted index over the cached locations, using the same geoparse
# configuration that was used to load them.
indexed_locations = geoparsepy.geo_parse_lib.calc_inverted_index(
    list_data=cached_locations,
    dict_geospatial_config=geospatial_config,
)
logger.info(f"Indexed {len(indexed_locations)} phrases")

# Build a lookup from OSM id to row indexes in cached_locations.
osmid_lookup = geoparsepy.geo_parse_lib.calc_osmid_lookup(
    cached_locations=cached_locations,
)
# Sample sentences (English and Italian) to run through the geoparser.
listText = [
    'hello New York, USA its Bill from Bassett calling',
    'live on the BBC Victoria Derbyshire is visiting Derbyshire for an exclusive UK interview',
    'Domani vado a Roma, nel Lazio',
    'Io sono di Sciacca, in provincia di agrigento',
    'Vengo dalla provincia di Agrigento, in Sicilia',
    'Mi sdraio sul prato del mio vicino',
    'Pavia e Ravenna sono belle città',
    'Voglio andare a new york',
    'Mi trovo a San Giuliano Terme',
    'Io sono di Sciacca, in provincia di Agrigento',
    'Martina vive a Nuoro ma vorrebbe andare ad Agrigento',
    'Agrigento è la provincia che contiene il comune di Sciacca',
    "Vicino san giuliano terme c'è un comune che si chiama Asciano",
    'La città di Sciacca si trova in provincia di Agrigento',
    'Mi trovo a Sciacca',
]
# Tokenize every text entry into unigram tokens (the tokenizer also cleans the
# text); the result holds one token list per entry, in the order of listText.
listTokenSets = [
    soton_corenlppy.common_parse_lib.unigram_tokenize_text(
        text=text,
        dict_common_config=geospatial_config,
    )
    for text in listText
]

# Geoparse all token sets in one call against the inverted index built from
# the cached locations.
listMatchSet = geoparsepy.geo_parse_lib.geoparse_token_set(
    token_set=listTokenSets,
    dict_inverted_index=indexed_locations,
    dict_geospatial_config=geospatial_config,
)
# Print the matched locations: each input text followed by one line per match.
# listMatchSet is walked in step with listText, so each text is paired with
# the matches found for it.
for text, listMatch in zip(listText, listMatchSet):
    logger.info(f"\nText: {text}")
    for tupleMatch in listMatch:
        logger.info(str(tupleMatch))
C:\Users\calog\PycharmProjects\geoparsepy\venv\Scripts\python.exe C:/Users/calog/PycharmProjects/geoparsepy/main2.py
Logging started
loading stoplist from C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-geo-stoplist-it.txt
loading stoplist from C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-geo-stoplist-en.txt
loading whitelist from C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-geo-whitelist.txt
loading blacklist from C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-geo-blacklist.txt
loading building types from C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-buildingtype-it.txt
loading location type corpus C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-buildingtype-it.txt
- 0 unique titles
- 61 unique types
loading street types from C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-streettype-it.txt
loading location type corpus C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-streettype-it.txt
- 10 unique titles
- 14 unique types
loading admin types from C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-admintype-it.txt
loading location type corpus C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-admintype-it.txt
- 10 unique titles
- 0 unique types
loading building types from C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-buildingtype-en.txt
loading location type corpus C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-buildingtype-en.txt
- 3 unique titles
- 76 unique types
loading street types from C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-streettype-en.txt
loading location type corpus C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-streettype-en.txt
- 15 unique titles
- 32 unique types
loading admin types from C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-admintype-en.txt
loading location type corpus C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\corpus-admintype-en.txt
- 14 unique titles
- 0 unique types
loading gazeteer from C:\Users\calog\PycharmProjects\geoparsepy\venv\lib\site-packages\geoparsepy\gazeteer-en.txt
caching locations : {'global_cities_admin': [-1, -1], 'global_cities_poly': [-1, -1], 'global_cities_line': [-1, -1], 'global_cities_point': [-1, -1], 'europe_places_admin': [-1, -1], 'europe_places_poly': [-1, -1], 'europe_places_line': [-1, -1], 'europe_places_point': [-1, -1], 'north_america_places_admin': [-1, -1], 'north_america_places_poly': [-1, -1], 'north_america_places_line': [-1, -1], 'north_america_places_point': [-1, -1], 'uk_places_admin': [-1, -1], 'uk_places_poly': [-1, -1], 'uk_places_line': [-1, -1], 'uk_places_point': [-1, -1]}
Loaded 800820 position
Indexed 605884 phrases
Text: hello New York, USA its Bill from Bassett calling
(1, 2, {(61785451,), (-175905,), (151937435,), (316976734,), (2218262347,), (29457403,), (-61320,)}, ('new', 'york'))
(2, 2, {(153924230,), (151528825,), (158656063,), (20913294,), (151672942,), (153595296,), (153968758,), (316990182,), (151651405,), (-134353,), (-1425436,), (153473841,)}, ('york',))
(4, 4, {(-148838,)}, ('usa',))
(8, 8, {(253067120,), (151840681,), (151463868,)}, ('bassett',))
Text: live on the BBC Victoria Derbyshire is visiting Derbyshire for an exclusive UK interview
(4, 4, {(75538688,), (385402175,), (151521359,), (74701108,), (-5606595,), (462241727,), (151395812,), (460070685,), (447925715,), (277608416,), (-1828436,), (-407423,), (154301948,), (-2316741,), (435240340,), (-5606596,), (463188523,), (151336948,), (151476805,), (30189922,), (158651084,), (-2256643,), (-10307525,)}, ('victoria',))
(8, 8, {(-195384,)}, ('derbyshire',))
(12, 12, {(-62149,)}, ('uk',))
Text: Domani vado a Roma, nel Lazio
(1, 1, {(151686158,)}, ('vado',))
(3, 3, {(385056116,), (-41313,)}, ('roma',))
(6, 6, {(-40784,)}, ('lazio',))
Text: Io sono di Sciacca, in provincia di agrigento
Text: Vengo dalla provincia di Agrigento, in Sicilia
(7, 7, {(-39152,)}, ('sicilia',))
Text: Mi sdraio sul prato del mio vicino
(3, 3, {(-42619,)}, ('prato',))
Text: Pavia e Ravenna sono belle città
(0, 0, {(158289705,), (-43483,), (230101550,)}, ('pavia',))
(2, 2, {(154313500,), (151333458,), (151866924,), (154149873,), (-42889,)}, ('ravenna',))
(4, 4, {(154337430,)}, ('belle',))
Text: Voglio andare a new york
(3, 4, {(61785451,), (-175905,), (151937435,), (316976734,), (2218262347,), (29457403,), (-61320,)}, ('new', 'york'))
(4, 4, {(153924230,), (151528825,), (158656063,), (20913294,), (151672942,), (153595296,), (153968758,), (316990182,), (151651405,), (-134353,), (-1425436,), (153473841,)}, ('york',))
Text: Mi trovo a San Giuliano Terme
(1, 1, {(62515792,)}, ('trovo',))
(3, 4, {(4594763552,), (130871200,), (6986638289,), (6008076012,), (3653962105,), (1213463381,), (5318245098,), (2815922128,)}, ('san', 'giuliano'))
(3, 5, {(258512997,)}, ('san', 'giuliano', 'terme'))
(5, 5, {(27013444,), (-1837372,)}, ('terme',))
Text: Io sono di Sciacca, in provincia di Agrigento
Text: Martina vive a Nuoro ma vorrebbe andare ad Agrigento
(3, 3, {(-39979,)}, ('nuoro',))
(8, 8, {(-39151,)}, ('agrigento',))
Text: Agrigento è la provincia che contiene il comune di Sciacca
(0, 0, {(-39151,)}, ('agrigento',))
Text: Vicino san giuliano terme c'è un comune che si chiama Asciano
(1, 2, {(4594763552,), (130871200,), (6986638289,), (6008076012,), (3653962105,), (1213463381,), (5318245098,), (2815922128,)}, ('san', 'giuliano'))
(1, 3, {(258512997,)}, ('san', 'giuliano', 'terme'))
(3, 3, {(27013444,), (-1837372,)}, ('terme',))
Text: La città di Sciacca si trova in provincia di Agrigento
Text: Mi trovo a Sciacca
(1, 1, {(62515792,)}, ('trovo',))