Commit 963b0310 authored by Moritz Schott's avatar Moritz Schott
Browse files

Create basic configuration and setup framework

parent da25f476
# Project
# Python
# PyCharm
# Docker
- repo:
rev: 21.12b0
- id: black
language_version: python3.8
- repo:
rev: 4.0.1
- id: flake8
- repo: local
- id: pylint
name: pylint
entry: poetry run pylint
language: system
types: [python]
"-rn", # Only display messages
"-sn", # Don't display the score
- repo:
rev: v4.1.0
- id: check-added-large-files
- id: check-ast
- id: check-json
- id: end-of-file-fixer
- id: mixed-line-ending
args: [ '--fix=lf' ]
- id: name-tests-test
- id: pretty-format-json
args: [ '--no-sort-keys' ]
- id: trailing-whitespace
- id: check-yaml
- repo:
rev: v1.4.0
- id: dead
- repo:
rev: v1.1.0
- id: pycln
- repo:
rev: v6.2.1
- id: beautysh
- repo:
rev: v2.1.0
- id: pretty-format-java
- repo:
rev: 5.10.1
- id: isort
args: ["--profile", "black"]
# Run vectorisation tool using a custom config
FROM python:3.8
# within docker container: run without root privileges
RUN useradd -md /home/oev oev
WORKDIR /oev/oev
RUN chown oev:oev . -R
USER oev:oev
# make poetry binaries available to the docker container user
ENV PATH=$PATH:/home/oev/.local/bin
# install only the dependencies
COPY --chown=oev:oev pyproject.toml pyproject.toml
COPY --chown=oev:oev poetry.lock poetry.lock
RUN pip install --no-cache-dir "poetry==1.1.12"
RUN python -m poetry install --no-ansi --no-interaction --no-root
# copy all the other files and install the project
COPY --chown=oev:oev ./ ./
RUN python -m poetry install --no-ansi --no-interaction
ENTRYPOINT ["python", "-m", "poetry", "run", "vectorise"]
# Setup database with required data
FROM osgeo/gdal:ubuntu-small-3.4.1
# within docker container: run without root privileges
RUN useradd -md /home/oev oev
WORKDIR /oev/oev
RUN chown oev:oev . -R
USER oev:oev
COPY --chown=oev:oev ./src/osm_element_vectorisation/setup/
version: "3.8"
image: postgis/postgis:14-3.2-alpine
container_name: postgisdb
shm_size: 1g
- ./docker-volume-data/postgres:/var/lib/postgresql/data
- "5432:5432"
POSTGRES_PASSWORD: postgresPassword
pipeline {
agent {
dockerfile {
dir 'Docker'
label 'vectorising-test'
stages {
stage('Test') {
steps {
sh 'python -m poetry run pytest ./test'
post {
failure {
rocketSend channel: 'idealvgi-oshdb', emoji: ':sob:' , message: "A build for the vectorising tools failed: (<${env.BUILD_URL}|Open Build in Jenkins>). Latest commit from ${LATEST_AUTHOR}. Review the code!" , rawMessage: true
This diff is collapsed.
# OSM Element Vectorisation
This tool transforms single OSM elements into feature vectors for attribute investigation and machine learning.
\ No newline at end of file
This tool transforms single OSM elements into feature vectors for attribute investigation and machine learning.
# Installation
We provide two installation procedures depending on your needs and skill. The **quick start** requires only little resources and will execute a minimal example you can play with. The **full installation** will set you up with a fully working tool but needs considerable resources and time to set up. You can activate either by adding `repex` or `full` to the `docker run` commands shown below.
Both methods will need [Docker]( and [docker-compose]( installed. If you do not want to use Docker, you can extract the _setup_ procedure from this [dockerfile](Docker/ and the _installation_ procedure from this [dockerfile](Docker/Dockerfile). In that case you must also set up your own database.
## Backend
The tool uses a [postgres]( - [postgis]( database as backend. [docker-compose.yaml](Docker/docker-compose.yaml) provides a basic setup but of course you should adapt it to your needs (user management, security etc.). For now, you can start the server running `docker-compose -f Docker/docker-compose.yaml up -d`.
## Data
The tool is resilient against any missing backend data, but the results will be very limited. The [](Docker/ will populate the backend database with all necessary datasets to produce meaningful results.
`docker build -t data-population -f Docker/ .`
`docker run data-population <mode>`
## Run Tool
`docker build -t vectorisation -f Docker/Dockerfile .`
`docker run vectorisation <mode>`
# Contribution Guidelines
You are welcome to contribute to this tool. The easiest way would be filing a concise [issue]( Yet direct code collaboration is also very welcome. In that case please install the dependency manager [poetry]( and run `poetry install` to create a local development environment. Before committing any edits, make sure you activate the [pre-commit hooks]( for cleaner code using `poetry run pre-commit install`. Commit your edits to a [fork]( and create a [pull-request](
This diff is collapsed.
name = "osm_element_vectorisation"
version = "0.1"
description = "This tool transforms single OSM elements to machine readable vectors of features or attributes (sometimes called embeddings)."
license = "GPL-3.0-or-later"
authors = ["Moritz Schott <>"]
readme = ""
repository = ""
keywords = [
"feature creation",
"data analyses",
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: End Users/Desktop",
"Intended Audience :: Science/Research",
"Natural Language :: English",
"Operating System :: POSIX :: Linux",
"Programming Language :: Python :: 3.8",
"Programming Language :: Java",
"Programming Language :: SQL",
"Programming Language :: R",
"Programming Language :: Unix Shell",
"Topic :: Scientific/Engineering :: GIS",
"Topic :: Software Development :: Libraries :: Python Modules"
python = "^3.8"
ohsome = "^0.1.0-rc.2"
psycopg2 = "^2.9.3"
argparse = "^1.4.0"
sentinelhub = {version = "^3.4.3", optional = true}
pre-commit = "^2.16.0"
pylint = "^2.12.2"
pytest = "^7.0.1"
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
vectorise = "osm_element_vectorisation.vectorisation:vectorise_cli"
sentinelhub = ["sentinelhub"]
# configuration for pre-commit hooks
# increase max-line-length to be compatible with default value of black
max-line-length = 88
min-public-methods = 0
max-args = 6
"""Python package to analyse single OSM elements."""
"""Modules to configure the vectorisation process."""
"""The command line interface."""
import argparse
import json
from osm_element_vectorisation.config.config import parse_config, read_config_file
from osm_element_vectorisation.config.enums import GeomTypes, ProcessingSteps
def parse_cl_args():
    """Parse the command line into a configuration.

    Builds an argparse parser, reads the JSON config file named on the
    command line and overlays any command line values on top of it.

    :return: The configuration (the result of parse_config)
    """
    # NOTE(review): several `parser.add_argument(...)` call headers and
    # closing parentheses are not visible in this diff excerpt; only their
    # keyword arguments (help=, choices=) remain. Visible tokens are kept
    # unchanged below.
    parser = argparse.ArgumentParser(
        description="Run vectorisation workflow.",
        epilog="A tool by GIScience Heidelberg.",
    # -- config file argument (add_argument header missing from view) --
    help="The path to the config file. See the documentation on how to write a "
    "config file and which defaults are assumed. Note that command line "
    "arguments given here overwrite configurations from the config file.",
    # Area-of-interest group: a bbox or an AOI table reference, mutually exclusive
    bbox_group = parser.add_argument_group(
        title="Area of interest",
        description="The area of interest to be analysed. Note: Only elements that lie "
        "completely inside the given bbox (including some buffer) are "
        "analysed. Errors may occur where objects were (temporarily) "
        "located far from their current location in their history. "
        "Overwrites config `aoi`. Choose one of "
        "the following notations:",
    bbox_mutex_group = bbox_group.add_mutually_exclusive_group()
    # -- AOI table argument (add_argument header missing from view) --
    help="Qualified table name (within the target database) and ID of the AOI "
    "geometry. The geometry must be stored in a postgis column named `geom` "
    "and the id in an `id`-column.",
    # Element selection group: explicit ids or a csv file of ids, mutually exclusive
    id_group = parser.add_argument_group(
        title="Select Elements",
        description="OSM-IDs to which the extraction should be limited in format "
        "`type/id` (e.g. way/12376). This is useful if you want to analyse "
        "a defined set of objects within the bounding-box. Overwrites "
        "config `limit_ids`. You can choose between two formats:",
    id_mutex_group = id_group.add_mutually_exclusive_group()
    id_mutex_group.add_argument("--ids", metavar="id", nargs="+", help="A list of ids.")
    # -- id file argument (add_argument header missing from view) --
    help="A path to a csv file containing the ids.",
    # -- geometry type argument (add_argument header missing from view) --
    choices=[state.value for state in GeomTypes],
    help="The type of geometry to be analysed. This includes multi- elements.",
    # -- timestamp argument (add_argument header missing from view) --
    help="Timestamp (ISO 8601) in relation to which the data will be analysed.",
    # -- data schema argument (add_argument header missing from view) --
    help="Schema name for data storage. If it exists already, it will be extended "
    "with the requested data, otherwise it will be created.",
    # -- stop-after argument (add_argument header missing from view) --
    choices=[state.value for state in ProcessingSteps],
    help="Which workflow stages should be executed.",
    args = parser.parse_args()
    # start from the file-based configuration ...
    config_dict = read_config_file(args.config_file)
    # overwrite config file with command line arguments. Any missing values will be
    # handled while parsing.
    if args.bbox or args.aoi_table:
        config_dict["aoi"] = args.bbox or args.aoi_table
    if args.timestamp:
        config_dict["timestamp"] = args.timestamp
    if args.ids:
        # ids given directly on the command line are stored as a JSON array string
        config_dict["limit_ids"] = json.dumps(args.ids)
    elif args.id_file:
        # a csv file path is passed through unchanged
        config_dict["limit_ids"] = args.id_file
    if args.geometry_type:
        config_dict["geometry_type"] = args.geometry_type
    if args.data_schema:
        config_dict["data_schema"] = args.data_schema
    if args.stop_after:
        config_dict["stop_after"] = args.stop_after
    return parse_config(config_dict)
"""Class to handle configuration parameters."""
import json
import logging
import pathlib
from datetime import datetime
from json import JSONDecodeError
from ohsome import OhsomeClient
from psycopg2 import OperationalError
from osm_element_vectorisation.config.db_parameters import (
from osm_element_vectorisation.config.enums import GeomTypes, ProcessingSteps
from osm_element_vectorisation.config.keys import Keys
from osm_element_vectorisation.config.ohsome import OhsomeClients
from osm_element_vectorisation.config.oshdb import OshdbConn
from osm_element_vectorisation.config.processing import Processing
class Settings:
    """Basic Settings to be provided by user."""

    # NOTE(review): the `self,` parameter and the closing `):` of __init__
    # are not visible in this diff excerpt; visible tokens are kept unchanged.
    def __init__(
        processing: Processing,
        db_parameters: DBParameters,
        ohsome: OhsomeClients = None,
        oshdb: OshdbConn = None,
        keys: Keys = None,
        """Initialise BaseSettings Object.

        :param processing: processing configuration
        :param db_parameters: a connection and settings object
        :param ohsome: ohsome clients
        :param keys: keys object
        :param oshdb: connection details to an oshdb data extract
        """
        self.processing = processing
        self.db_parameters = db_parameters
        # fall back to default clients / empty key set when none are supplied
        self.ohsome = ohsome or OhsomeClients()
        self.keys = keys or Keys()
        # oshdb may legitimately remain None (no oshdb extract configured)
        self.oshdb = oshdb
def read_config_file(config_path: str):
    """Read a JSON file and return it as dictionary.

    :param config_path: Path to .json file
    :return: a dictionary of the content
    :raises ValueError: if the file does not contain valid JSON
    """
    config_file = pathlib.Path(config_path)
    # NOTE(review): the `try:` line opening this try/except appears to be
    # missing from this diff excerpt; visible tokens are kept unchanged.
    with open(config_file, "r", encoding="utf-8") as config:
        configuration = json.load(config)
    except JSONDecodeError as json_exception:
        # re-raise with a friendlier message while keeping the original cause
        raise ValueError(
            "Could not parse the provided JSON file. Make sure it "
            "contains valid JSON"
        ) from json_exception
    return configuration
def parse_config(config_dict: dict):
    """Parse configuration to create Settings object.

    :param config_dict: dictionary with config details
    :return: Settings object containing configuration details
    """
    # mandatory settings first: aoi + timestamp become a Processing object
    processing = parse_minimum_required(config_dict)
    # database connections (with defaults where possible)
    db_parameters = parse_db_settings(config_dict)
    # optional settings overlay the defaults carried by `processing`
    args = parse_optional_args(config_dict, processing)
    out = Settings(db_parameters=db_parameters, **args)
    return out
def parse_minimum_required(config_dict: dict):
    """Check configuration dict.

    :param config_dict: dictionary with config details
    :return: a Processing object built from the mandatory `aoi` and
        `timestamp` entries
    :raises ValueError: if `aoi` or `timestamp` is missing
    """
    # NOTE(review): the `try:` lines opening these try/except pairs appear
    # to be missing from this diff excerpt; visible tokens are kept unchanged.
    aoi = config_dict["aoi"]
    except KeyError as config_error:
        raise ValueError("The aoi was not specified") from config_error
    # timestamp must be ISO 8601 (parsed with datetime.fromisoformat)
    timestamp = datetime.fromisoformat(config_dict["timestamp"])
    except KeyError as config_error:
        raise ValueError("The targeted timestamp was not specified") from config_error
    return Processing(aoi=aoi, timestamp=timestamp)
def parse_optional_args(config_dict: dict, processing: Processing):
    """Replace default optional if given.

    :param config_dict: dictionary with config details
    :param processing: Processing object carrying the defaults
    :return: dict of keyword arguments for the Settings constructor
    """
    # NOTE(review): several closing parentheses and statement fragments are
    # missing or garbled in this diff excerpt; visible tokens are kept
    # unchanged.
    out = {}
    # overlay optional processing settings, keeping the existing defaults
    processing.limit_ids = config_dict.get("limit_ids", processing.limit_ids)
    processing.geom_type = GeomTypes(config_dict.get("geom_type", processing.geom_type))
    processing.stop_after = ProcessingSteps(
        config_dict.get("stop_after", processing.stop_after)
    processing.comment = config_dict.get("comment", processing.comment)
    processing.log_level = logging.getLevelName(
        config_dict.get("log_level", processing.log_level)
    out["processing"] = processing
    # API keys (all optional)
    keys = Keys()
    keys.osmcha_api_key = config_dict.get("osmcha_api_key", keys.osmcha_api_key)
    # NOTE(review): config key "sentinelhub_id" feeding `sentinelhub_key`
    # looks inconsistent — confirm the intended config key name.
    keys.sentinelhub_key = config_dict.get("sentinelhub_id", keys.sentinelhub_key)
    out["keys"] = keys
    # ohsome clients: optionally pointed at a custom ohsome API URL
    ohsome = OhsomeClients()
    if config_dict.get("ohsomeURL"):
        ohsome = OhsomeClients(ohsome_url=config_dict.get("ohsomeURL"))
    # NOTE(review): the next line appears to fuse two statements together
    if config_dict.get("ohsomeURL_download"): = OhsomeClient(
    out["ohsome"] = ohsome
    # optional oshdb connection; incomplete settings are ignored with a warning
    if config_dict.get("oshdb"):
        if config_dict["oshdb"].get("oshdb"):
            out["oshdb"] = OshdbConn(
        "OSHDB settings incomplete, missing connection string. Will ignore "
        "oshdb-based indicators."
    return out
def create_dbconn(config_dict: dict, db_name: str):
    """Create a default connection using the configuration dictionary.

    :param config_dict: dictionary with config details
    :param db_name: config key under which the connection parameters live
    :return: a connection built from the named entry, or None when the
        entry is absent or empty
    """
    connection_params = config_dict.get(db_name)
    if not connection_params:
        return None
    return get_default_conn(**connection_params)
def parse_db_settings(config_dict: dict):
    """Parse a configuration dictionary for database connection settings.

    :param config_dict: dictionary with config details
    :return: a DBParameters object
    :raises Exception: when a database connection cannot be established
    """
    # NOTE(review): a `try:` line and a closing parenthesis appear to be
    # missing from this diff excerpt; visible tokens are kept unchanged.
    db_parameters = {}
    # target database: explicit config entry, else the library default
    db_parameters["target_db"] = (
        create_dbconn(config_dict, "target_database_conn") or get_default_conn()
    if config_dict.get("data_schema"):
        db_parameters["data_schema"] = config_dict.get("data_schema")
    # optional auxiliary databases (may stay unset)
    db_parameters["changesets_db"] = create_dbconn(config_dict, "changesets_db")
    db_parameters["notes_db"] = create_dbconn(config_dict, "notes_db")
    except OperationalError as db_exception:
        # surface connection failures with a hint about default credentials
        raise Exception(
            "Cannot connect to the database with the provided connection parameters. "
            "Did you forget to overwrite a default?"
        ) from db_exception
    return DBParameters(**db_parameters)
"""Database connection parameters."""
# pylint: disable=too-many-arguments
from psycopg2 import OperationalError
from psycopg2.pool import ThreadedConnectionPool
from psycopg2.sql import Identifier
class DBParameters:
    """Database connection parameters."""

    # NOTE(review): the `self,` parameter and the closing `):` of __init__
    # are not visible in this diff excerpt; visible tokens are kept unchanged.
    def __init__(
        target_db: ThreadedConnectionPool,
        changesets_db: ThreadedConnectionPool = None,
        notes_db: ThreadedConnectionPool = None,
        data_schema: str = "osm_vectorisation",
        """Init database connection parameters.

        :param target_db: connection pool for the target database
        :param data_schema: name of the schema the data should be written to
        :param notes_db: connection pool for the notes database (optional)
        :param changesets_db: connection pool for the changesets database
            (optional)
        """
        self.target_db = target_db
        self.changesets_db = changesets_db
        self.notes_db = notes_db
        # pre-computed psycopg2 sql Identifier objects for the data schema
        self.identifiers = get_identifiers(data_schema)

    def get_identifier_str(self, identifier):
        """Transform identifier as string.

        :param identifier: key of target_db.identifier dict
        :return: string containing schemas and tables specified in Identifier
        """
        # Identifier.strings holds the dotted-name parts; rejoin with "."
        return ".".join(self.identifiers[identifier].strings)
def get_identifiers(schema: str):
"""Construct database object identifiers.
:param schema: The data schema name