Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Optional environment variables that are taken into account when building and running the containers.
# To use them, copy this file to a local .env file and adapt the values.

MARA_PROJECT_NAME=mara-example
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@

/app/local_setup.py
/packages
/data
/.venv
/app/bigquery_downloader/bigquery-credentials.json

*.pyc
.coverage
/.idea
/.vscode
/.project
.DS_Store
/.pg-data
.env
38 changes: 38 additions & 0 deletions .scripts/docker/mara-app/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Image for running and developing the mara application.
FROM python:3.7

# Default working directory
WORKDIR /mara

# Install the system packages needed by the project and by an interactive shell.
# The apt package lists are removed in the same layer so that they do not
# end up in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
        dialog \
        coreutils \
        graphviz \
        python3-dev \
        python3-venv \
        rsync \
        nano \
        telnet \
        postgresql \
        zsh \
        sudo \
        less \
    && locale-gen en_US.UTF-8 \
    && rm -rf /var/lib/apt/lists/*

# Copy the entrypoint script into the working directory and make it executable
COPY ./docker-entrypoint.sh /mara/
RUN ["chmod", "+x", "/mara/docker-entrypoint.sh"]

# User and permissions
# A user named after the UID build argument (default: 1000) is created with a
# bash login shell, gets its own name as password and is added to the sudo group.
ARG UID=1000
RUN useradd -ms /bin/bash ${UID} && echo "${UID}:${UID}" | chpasswd && adduser ${UID} sudo
# Members of the sudo group may run any command without entering a password
RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
USER ${UID}:${UID}

# Install and configure OhMyZSH
ENV TERM=xterm
ENV ZSH_THEME=robbyrussell
RUN wget -O- https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh | zsh

# Default container's command (resolved relative to WORKDIR /mara)
CMD ["./docker-entrypoint.sh"]
26 changes: 26 additions & 0 deletions .scripts/docker/postgres/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# PostgreSQL 11 image with the cstore_fdw extension and a custom server configuration.
FROM postgres:11

# Packages that are only needed to compile cstore_fdw; they are purged again after the build
ARG buildDeps='git libpq-dev libprotobuf-c0-dev make postgresql-server-dev-11 protobuf-c-compiler gcc g++'

# Build and install the cstore_fdw extension (tag v1.6.2).
# Installing the build dependencies, compiling, and cleaning up happen in ONE
# layer: files deleted in a later layer are still stored in the earlier ones,
# so a separate clean-up step would not make the image any smaller.
# NOTE(review): `purge --auto-remove` may also remove runtime libraries that were
# pulled in by the -dev packages (e.g. the protobuf-c runtime) - confirm that
# the extension still loads in the final image.
RUN apt-get update \
    && apt-get install -y ${buildDeps} \
    && cd /tmp \
    && git clone -b v1.6.2 https://github.com/citusdata/cstore_fdw.git \
    && cd /tmp/cstore_fdw \
    && PATH=/usr/local/pgsql/bin/:$PATH make \
    && PATH=/usr/local/pgsql/bin/:$PATH make install \
    && cd / \
    && rm -rf /tmp/cstore_fdw \
    && apt-get purge -y --auto-remove ${buildDeps} \
    && rm -rf /var/lib/apt/lists/*

# Copy custom configuration
COPY mara-postgres.conf /etc/postgresql/postgresql.conf

# Add initdb.sql to the /docker-entrypoint-initdb.d folder of the postgres server.
# On the initial start-up of the database server, all .sql and .sh files in this
# folder are run, which makes it the place to create the databases the project uses.
COPY initdb.sql /docker-entrypoint-initdb.d/

# Start the server with the custom configuration file
CMD ["postgres", "-c", "config_file=/etc/postgresql/postgresql.conf"]
6 changes: 6 additions & 0 deletions .scripts/docker/postgres/initdb.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- Initialization script for the PostgreSQL server used by the example project.

-- The two databases used by the project
CREATE DATABASE example_project_dwh;
CREATE DATABASE example_project_mara;

-- Superuser role that is allowed to log in
CREATE ROLE root SUPERUSER LOGIN;

-- NOTE(review): CREATE EXTENSION only affects the database this script is
-- connected to, not the two databases created above - confirm whether
-- cstore_fdw is also needed in example_project_dwh.
CREATE EXTENSION IF NOT EXISTS cstore_fdw;
67 changes: 67 additions & 0 deletions .scripts/docker/postgres/mara-postgres.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# This is maintained by salt
# server
# data_directory = ''
listen_addresses = '*'
port = 5432

idle_in_transaction_session_timeout=180s


max_connections = 500

# mostly from pg_tune, only work_mem is specifically tuned
# memory
shared_buffers = 4GB
effective_cache_size = 8GB # was default = 4GB
work_mem = 1GB
maintenance_work_mem = 1GB
min_wal_size = 512MB # was default = 80MB
max_wal_size = 1GB # was default = 1GB
checkpoint_completion_target = 0.9 # was default = 0.5
temp_buffers = 1GB
max_locks_per_transaction = 1024

# io
wal_level = minimal
max_wal_senders = 0
fsync = off
synchronous_commit = off
full_page_writes = off
wal_buffers = 16MB # was -1


random_page_cost = 1 # default is 4. On SSDs, there is no significant penalty for random page access
default_statistics_target = 500 # Analyze takes longer, but big queries tend to be better planned


# query log
logging_collector = on
# log_directory = '/bi/logs/postgresql/'
log_filename = 'query.log'

log_min_duration_statement = 200
debug_pretty_print = on
log_lock_waits = on

session_preload_libraries = 'auto_explain'
auto_explain.log_min_duration = '5'

auto_explain.log_nested_statements = true
auto_explain.log_verbose = true


lc_messages = 'C'
lc_monetary = 'C'
lc_numeric = 'C'
lc_time = 'C'


# vacuum
autovacuum_max_workers = 1

# timezone
timezone = 'Europe/Berlin'


# cstore
shared_preload_libraries = 'cstore_fdw'
2 changes: 0 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,3 @@ include .scripts/mara-app/install.mk
# if you don't want to download the two big
sync-bigquery-csv-data-sets-from-s3:
.venv/bin/aws s3 sync s3://mara-example-project-data data --delete --no-progress --no-sign-request


106 changes: 95 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,92 @@ It shows its

 

## Getting started
## Installation

### System requirements
Python and PostgreSQL are the main requirements of the project.
They can be run via [Docker](#running-python-and-postgresql-via-docker) or installed [natively](#installing-python-and-postgresql-natively).


### Running Python and PostgreSQL via Docker

Requirements: `docker`, `docker-compose`

Build the images, create and start the containers:

```console
$ MARA_PROJECT_NAME=mara-example docker-compose up --build
```

If the images are already built, then a simple `docker-compose up` will start the containers.

This will:
- create the `mara-postgres:mara-example-dev` and the `mara-app:mara-example-dev` images
- expose and serve a postgres instance at port 5432
- create a bind-mount of the application's codebase, so that the image does not have to be rebuilt when the code changes on the host
- create a named docker volume for managing the postgres db data
- keep the `mara-app` container alive in development mode after building, by overriding the container's default command in docker-compose

A custom container name is required for running multiple projects out of the same image by
setting the required environment variable ```MARA_PROJECT_NAME```.
This can be set as part of the `docker-compose` commands or
alternatively in a `.env` file (see [`.env.example`](.env.example)).
Default value is `mara-example`.

To open a terminal in the running `mara-app` container, run:

```console
# For the mara-app container
$ docker exec -it mara-app zsh
```

The following example shows how to access the Postgres database data (stored in a named docker volume) and the log files from the host:

```console
# Access PostgreSQL through the psql client

# From inside the container
$ psql -h mara-example-postgres -p 5432 -U postgres

# From host
$ psql -h localhost -p 5432 -U postgres

# View all docker volumes and retrieve the name of the Postgres data one
$ docker volume ls

# Output
DRIVER VOLUME NAME
local mara-example-project_mara-postgres-data

# Inspect named docker volume
# Postgres data is stored in the path given by the "Mountpoint" entry of the inspect command output
$ docker volume inspect mara-example-project_mara-postgres-data

# Output
[
{
"CreatedAt": "2020-02-24T12:17:24+01:00",
"Driver": "local",
"Labels": {
"com.docker.compose.project": "mara-example-project",
"com.docker.compose.version": "1.23.1",
"com.docker.compose.volume": "mara-postgres-data"
},
"Mountpoint": "/var/lib/docker/volumes/mara-example-project_mara-postgres-data/_data",
"Name": "mara-example-project_mara-postgres-data",
"Options": null,
"Scope": "local"
}
]

# Access Postgres query log
sudo tail -f /var/lib/docker/volumes/mara-example-project_mara-postgres-data/_data/log/query.log
```

 

### Installing Python and PostgreSQL natively

#### System requirements

Python >=3.6 and PostgreSQL >=10 and some smaller packages are required to run the example (and mara in general).

Expand All @@ -159,7 +242,7 @@ $ sudo apt install git dialog coreutils graphviz python3 python3-dev python3-ven

 

Mara does not run Windows.
Mara does not run on Windows natively.

 

Expand All @@ -169,19 +252,22 @@ To optimize PostgreSQL for ETL workloads, update your postgresql.conf along [thi

Start a database client with `sudo -u postgres psql postgres` and then create a user with `CREATE ROLE root SUPERUSER LOGIN;` (you can use any other name).

 

### Installation
#### Installation

Clone the repository somewhere. Copy the file [`app/local_setup.py.example`](app/local_setup.py.example) to `app/local_setup.py` and adapt to your machine.
If `app/local_setup.py` is missing, it will be created during the initialization of the application.

Log into PostgreSQL with `psql -U root postgres` and create two databases:
Log into PostgreSQL with `psql -U root postgres` and create two databases (if the Docker setup is used, the databases and roles are created as part of the build and are defined in the [`.scripts/docker/postgres/initdb.sql`](.scripts/docker/postgres/initdb.sql) file):

```sql
CREATE DATABASE example_project_dwh;
CREATE DATABASE example_project_mara;
```

 

## Running the web UI

Hit `make` in the root directory of the project. This will

- create a virtual environment in `.venv`,
Expand All @@ -196,12 +282,10 @@ $ source .venv/bin/activate

To list all available flask cli commands, run `flask` without parameters.

 

### Running the web UI
In order to start the Flask application, run:

```console
$ flask run --with-threads --reload --eager-loading
$ make run-flask
```

The app is now accessible at [http://localhost:5000](http://localhost:5000).
Expand Down
1 change: 1 addition & 0 deletions app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pathlib
import sys
from shutil import copy

# configure application and packages
import app.data_integration
Expand Down
2 changes: 1 addition & 1 deletion app/data_integration/pipelines/github/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,4 +101,4 @@
description="Replaces the current gh_dim schema with the contents of gh_dim_next",
commands=[
ExecuteSQL(sql_statement="SELECT util.replace_schema('gh_dim', 'gh_dim_next');")
]))
]))
2 changes: 1 addition & 1 deletion app/data_integration/pipelines/pypi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,4 @@
description="Replaces the current pypi_dim schema with the contents of pypi_dim_next",
commands=[
ExecuteSQL(sql_statement="SELECT util.replace_schema('pypi_dim', 'pypi_dim_next');")
]))
]))
9 changes: 6 additions & 3 deletions app/local_setup.py.example
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime
import os

import data_integration.config
import mara_acl.config
Expand All @@ -11,16 +12,18 @@ import app.config

@patch(mara_db.config.databases)
def databases():
    """Return the database connections of the project, keyed by alias.

    The PostgreSQL host is taken from the ``POSTGRES_HOST`` environment
    variable (the container's environment); when the variable is unset or
    empty, ``localhost`` is used for a native PostgreSQL installation.
    """
    db_host = os.environ.get('POSTGRES_HOST') or 'localhost'
    return {
        # the project requires two databases: 'mara' for the app itself, and 'dwh' for the etl
        'dwh': mara_db.dbs.PostgreSQLDB(user='root', host=db_host, database='example_project_dwh'),
        'mara': mara_db.dbs.PostgreSQLDB(user='root', host=db_host, database='example_project_mara')
    }


# Disable http header based authentication
patch(mara_acl.config.require_email_http_header)(lambda: False)


# How many cores to use for running the ETL, defaults to the number of CPUs of the machine
# On production, make sure the ETL does not slow down other services too much
patch(data_integration.config.max_number_of_parallel_tasks)(lambda: 4)
Expand Down
Loading