Database to File
Examples of using Sling to load data from databases to storage systems
We first need to make sure our connections are available in our environment, either as environment variables or in the Sling env file. See the connections documentation for more details.
export MY_SOURCE_DB='...'
$ sling conns list
+---------------+------------------+-----------------+
| CONN NAME | CONN TYPE | SOURCE |
+---------------+------------------+-----------------+
| MY_S3_BUCKET | FileSys - S3 | sling env yaml |
| MY_SOURCE_DB | DB - PostgreSQL | env variable |
| MY_GS_BUCKET | FileSys - Google | sling env yaml |
| MY_AZURE_CONT | FileSys - Azure | sling env yaml |
+---------------+------------------+-----------------+
# using windows Powershell
$env:MY_SOURCE_DB = '...'
$ sling conns list
+---------------+------------------+-----------------+
| CONN NAME | CONN TYPE | SOURCE |
+---------------+------------------+-----------------+
| MY_S3_BUCKET | FileSys - S3 | sling env yaml |
| MY_SOURCE_DB | DB - PostgreSQL | env variable |
| MY_GS_BUCKET | FileSys - Google | sling env yaml |
| MY_AZURE_CONT | FileSys - Azure | sling env yaml |
+---------------+------------------+-----------------+
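The '...' placeholder above stands for the actual connection string. For a PostgreSQL source, this is typically a standard connection URL such as the following (user, password, host and database name are placeholders):
export MY_SOURCE_DB='postgresql://user:password@host:5432/dbname?sslmode=require'
You can also verify that Sling can reach a connection before running anything:
$ sling conns test MY_SOURCE_DB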
Using the CLI
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_file.csv'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_csv_folder/*.csv'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_csv_folder/' \
--tgt-options '{file_max_rows: 100000, format: csv}'
# Windows path format
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file://C:/Temp/my_csv_folder/' \
--tgt-options '{file_max_rows: 100000, format: csv}'
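When file_max_rows is set and the target object is a folder, Sling splits the output into multiple files of at most that many rows each, instead of writing a single file.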
Using a Replication
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: LOCAL

defaults:
  target_options:
    format: csv

streams:
  source_schema.source_table:
    object: file:///tmp/my_file.csv

  source_schema.source_table1:
    object: file:///tmp/my_csv_folder/*.csv

  source_schema.source_table2:
    object: file:///tmp/my_csv_folder/
    target_options:
      file_max_rows: 100000

  source_schema.source_table3:
    object: file://C:/Temp/my_csv_folder/ # Windows path format
    target_options:
      file_max_rows: 100000

  # all tables in schema, except "forbidden_table"
  my_schema.*:
    object: file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/
    target_options:
      file_max_rows: 400000 # will split files into folder

  my_schema.forbidden_table:
    disabled: true

env:
  SLING_THREADS: 3 # run streams concurrently
Using Python
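The Python examples below use the Sling Python package, which can be installed from PyPI:
$ pip install sling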
from sling import Replication, ReplicationStream, TargetOptions, Format
import os

# Set environment variables
os.environ['MY_SOURCE_DB'] = '...'

# Single file export
replication = Replication(
    source='MY_SOURCE_DB',
    target='LOCAL',
    streams={
        'source_schema.source_table': ReplicationStream(
            object='file:///tmp/my_file.csv'
        )
    }
)

# Run the replication
replication.run()

# Multiple streams with target options
replication = Replication(
    source='MY_SOURCE_DB',
    target='LOCAL',
    defaults=TargetOptions(format=Format.CSV),
    streams={
        'source_schema.source_table': ReplicationStream(
            object='file:///tmp/my_file.csv'
        ),
        'source_schema.source_table1': ReplicationStream(
            object='file:///tmp/my_csv_folder/*.csv'
        ),
        'source_schema.source_table2': ReplicationStream(
            object='file:///tmp/my_csv_folder/',
            target_options=TargetOptions(file_max_rows=100000)
        ),
        'source_schema.source_table3': ReplicationStream(
            object='file://C:/Temp/my_csv_folder/',  # Windows path format
            target_options=TargetOptions(file_max_rows=100000)
        )
    },
    env={'SLING_THREADS': '3'}  # run streams concurrently
)

replication.run()

# Schema wildcard with disabled stream
replication = Replication(
    source='MY_SOURCE_DB',
    target='LOCAL',
    defaults=TargetOptions(format=Format.CSV),
    streams={
        'my_schema.*': ReplicationStream(
            object='file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/',
            target_options=TargetOptions(file_max_rows=400000)
        ),
        'my_schema.forbidden_table': ReplicationStream(
            disabled=True
        )
    },
    env={'SLING_THREADS': '3'}
)

replication.run()
Using the CLI
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --stdout
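Since --stdout streams the table (as CSV by default) to standard output, the result can be redirected or piped like any other command-line output, for example:
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' --stdout > /tmp/my_file.csv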
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_file.json'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_json_folder/*.json'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_json_folder/' \
--tgt-options '{file_max_bytes: 4000000, format: json}'
# Windows Path format
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file://C:/Temp/my_json_folder/' \
--tgt-options '{file_max_bytes: 4000000, format: json}'
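Unlike file_max_rows, the file_max_bytes option splits output files by approximate size in bytes rather than by row count.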
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: LOCAL

defaults:
  target_options:
    format: json

streams:
  source_schema.source_table:
    object: file:///tmp/my_file.json

  source_schema.source_table1:
    object: file:///tmp/my_json_folder/*.json

  source_schema.source_table2:
    object: file:///tmp/my_json_folder/
    target_options:
      file_max_bytes: 4000000

  source_schema.source_table3:
    object: file://C:/Temp/my_json_folder/ # Windows path format
    target_options:
      file_max_bytes: 4000000

  # all tables in schema, except "forbidden_table"
  my_schema.*:
    object: file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/
    target_options:
      file_max_rows: 400000 # will split files into folder

  my_schema.forbidden_table:
    disabled: true

env:
  SLING_THREADS: 3 # run streams concurrently
from sling import Replication, ReplicationStream, TargetOptions, Format
import os

# Set environment variables
os.environ['MY_SOURCE_DB'] = '...'

# Single JSON file export
replication = Replication(
    source='MY_SOURCE_DB',
    target='LOCAL',
    streams={
        'source_schema.source_table': ReplicationStream(
            object='file:///tmp/my_file.json'
        )
    }
)

# Run the replication
replication.run()

# Multiple streams with target options
replication = Replication(
    source='MY_SOURCE_DB',
    target='LOCAL',
    defaults=TargetOptions(format=Format.JSON),
    streams={
        'source_schema.source_table': ReplicationStream(
            object='file:///tmp/my_file.json'
        ),
        'source_schema.source_table1': ReplicationStream(
            object='file:///tmp/my_json_folder/*.json'
        ),
        'source_schema.source_table2': ReplicationStream(
            object='file:///tmp/my_json_folder/',
            target_options=TargetOptions(file_max_bytes=4000000)
        ),
        'source_schema.source_table3': ReplicationStream(
            object='file://C:/Temp/my_json_folder/',  # Windows path format
            target_options=TargetOptions(file_max_bytes=4000000)
        )
    },
    env={'SLING_THREADS': '3'}  # run streams concurrently
)

replication.run()

# Schema wildcard with disabled stream
replication = Replication(
    source='MY_SOURCE_DB',
    target='LOCAL',
    defaults=TargetOptions(format=Format.JSON),
    streams={
        'my_schema.*': ReplicationStream(
            object='file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/',
            target_options=TargetOptions(file_max_rows=400000)
        ),
        'my_schema.forbidden_table': ReplicationStream(
            disabled=True
        )
    },
    env={'SLING_THREADS': '3'}
)

replication.run()
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_file.jsonl' \
--tgt-options '{format: jsonlines}'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_json_folder/*.jsonl'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_json_folder/' \
--tgt-options '{file_max_bytes: 4000000, format: jsonlines}'
# Windows Path format
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file://C:/Temp/my_json_folder/' \
--tgt-options '{file_max_bytes: 4000000, format: jsonlines}'
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: LOCAL

defaults:
  target_options:
    format: jsonlines

streams:
  source_schema.source_table:
    object: file:///tmp/my_file.jsonl

  source_schema.source_table1:
    object: file:///tmp/my_jsonlines_folder/*.jsonl

  source_schema.source_table2:
    object: file:///tmp/my_jsonlines_folder/
    target_options:
      file_max_bytes: 4000000

  source_schema.source_table3:
    object: file://C:/Temp/my_jsonlines_folder/ # Windows path format
    target_options:
      file_max_bytes: 4000000

  # all tables in schema, except "forbidden_table"
  my_schema.*:
    object: file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/
    target_options:
      file_max_rows: 400000 # will split files into folder

  my_schema.forbidden_table:
    disabled: true

env:
  SLING_THREADS: 3 # run streams concurrently
from sling import Replication, ReplicationStream, TargetOptions, Format
import os

# Set environment variables
os.environ['MY_SOURCE_DB'] = '...'

# Single JSON Lines file export
replication = Replication(
    source='MY_SOURCE_DB',
    target='LOCAL',
    streams={
        'source_schema.source_table': ReplicationStream(
            object='file:///tmp/my_file.jsonl',
            target_options=TargetOptions(format=Format.JSONLINES)
        )
    }
)

# Run the replication
replication.run()

# Multiple streams with target options
replication = Replication(
    source='MY_SOURCE_DB',
    target='LOCAL',
    defaults=TargetOptions(format=Format.JSONLINES),
    streams={
        'source_schema.source_table': ReplicationStream(
            object='file:///tmp/my_file.jsonl'
        ),
        'source_schema.source_table1': ReplicationStream(
            object='file:///tmp/my_jsonlines_folder/*.jsonl'
        ),
        'source_schema.source_table2': ReplicationStream(
            object='file:///tmp/my_jsonlines_folder/',
            target_options=TargetOptions(file_max_bytes=4000000)
        ),
        'source_schema.source_table3': ReplicationStream(
            object='file://C:/Temp/my_jsonlines_folder/',  # Windows path format
            target_options=TargetOptions(file_max_bytes=4000000)
        )
    },
    env={'SLING_THREADS': '3'}  # run streams concurrently
)

replication.run()

# Schema wildcard with disabled stream
replication = Replication(
    source='MY_SOURCE_DB',
    target='LOCAL',
    defaults=TargetOptions(format=Format.JSONLINES),
    streams={
        'my_schema.*': ReplicationStream(
            object='file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/',
            target_options=TargetOptions(file_max_rows=400000)
        ),
        'my_schema.forbidden_table': ReplicationStream(
            disabled=True
        )
    },
    env={'SLING_THREADS': '3'}
)

replication.run()
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_file.parquet'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_parquet_folder/*.parquet'
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file:///tmp/my_parquet_folder/' \
--tgt-options '{file_max_rows: 4000000, format: parquet}'
# Windows Path format
$ sling run --src-conn MY_SOURCE_DB \
--src-stream 'source_schema.source_table' \
--tgt-object 'file://C:/Temp/my_parquet_folder/' \
--tgt-options '{file_max_rows: 4000000, format: parquet}'
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: LOCAL

defaults:
  target_options:
    format: parquet

streams:
  source_schema.source_table:
    object: file://C:/Temp/my_file.parquet # Windows path format

  source_schema.source_table1:
    object: file://C:/Temp/my_parquet_folder/*.parquet # Windows path format

  source_schema.source_table2:
    object: file:///tmp/my_parquet_folder/
    target_options:
      file_max_rows: 1000000

  # all tables in schema, except "forbidden_table"
  my_schema.*:
    object: file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/
    target_options:
      file_max_rows: 400000 # will split files into folder

  my_schema.forbidden_table:
    disabled: true

env:
  SLING_THREADS: 3 # run streams concurrently
from sling import Replication, ReplicationStream, TargetOptions, Format
import os

# Set environment variables
os.environ['MY_SOURCE_DB'] = '...'

# Single Parquet file export
replication = Replication(
    source='MY_SOURCE_DB',
    target='LOCAL',
    streams={
        'source_schema.source_table': ReplicationStream(
            object='file:///tmp/my_file.parquet'
        )
    }
)

# Run the replication
replication.run()

# Multiple streams with target options
replication = Replication(
    source='MY_SOURCE_DB',
    target='LOCAL',
    defaults=TargetOptions(format=Format.PARQUET),
    streams={
        'source_schema.source_table': ReplicationStream(
            object='file://C:/Temp/my_file.parquet'  # Windows path format
        ),
        'source_schema.source_table1': ReplicationStream(
            object='file://C:/Temp/my_parquet_folder/*.parquet'  # Windows path format
        ),
        'source_schema.source_table2': ReplicationStream(
            object='file:///tmp/my_parquet_folder/',
            target_options=TargetOptions(file_max_rows=1000000)
        )
    },
    env={'SLING_THREADS': '3'}  # run streams concurrently
)

replication.run()

# Schema wildcard with disabled stream
replication = Replication(
    source='MY_SOURCE_DB',
    target='LOCAL',
    defaults=TargetOptions(format=Format.PARQUET),
    streams={
        'my_schema.*': ReplicationStream(
            object='file:///tmp/{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/',
            target_options=TargetOptions(file_max_rows=400000)
        ),
        'my_schema.forbidden_table': ReplicationStream(
            disabled=True
        )
    },
    env={'SLING_THREADS': '3'}
)

replication.run()
# Amazon S3
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_file.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_csv_folder/*.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_csv_folder/' \
  --tgt-options '{file_max_rows: 100000, format: csv}'

# Google Cloud Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_file.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_csv_folder/*.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_csv_folder/' \
  --tgt-options '{file_max_rows: 100000, format: csv}'

# Azure Blob Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_file.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_csv_folder/*.csv'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_csv_folder/' \
  --tgt-options '{file_max_rows: 100000, format: csv}'
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: MY_CLOUD_STORAGE

defaults:
  object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.csv.gz'
  target_options:
    format: csv
    compression: gzip

streams:
  # all tables in schema
  my_schema.*:
    object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.csv'
    target_options:
      file_max_rows: 400000 # will split files into folder

  other_schema.source_table: # will use defaults

env:
  SLING_THREADS: 3 # run streams concurrently
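{stream_schema}, {stream_table}, {YYYY}, {MM} and {DD} are runtime variables that Sling resolves per stream at execution time. For example, for other_schema.source_table running on 2024-01-15, the default object above would resolve to other_schema/source_table/2024_01_15.csv.gz.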
from sling import Replication, ReplicationStream, TargetOptions, Format, Compression
import os

# Set environment variables
os.environ['MY_SOURCE_DB'] = '...'
os.environ['MY_CLOUD_STORAGE'] = '...'

# Cloud storage export with defaults
replication = Replication(
    source='MY_SOURCE_DB',
    target='MY_CLOUD_STORAGE',
    defaults={
        'object': '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.csv.gz',
        'target_options': TargetOptions(
            format=Format.CSV,
            compression=Compression.GZIP
        )
    },
    streams={
        # all tables in schema
        'my_schema.*': ReplicationStream(
            object='{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.csv',
            target_options=TargetOptions(file_max_rows=400000)
        ),
        'other_schema.source_table': {}  # will use defaults
    },
    env={'SLING_THREADS': '3'}  # run streams concurrently
)

replication.run()
# Amazon S3
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_file.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_json_folder/*.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_json_folder/' \
  --tgt-options '{file_max_rows: 100000, format: json}'

# Google Cloud Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_file.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_json_folder/*.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_json_folder/' \
  --tgt-options '{file_max_rows: 100000, format: json}'

# Azure Blob Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_file.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_json_folder/*.json'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_json_folder/' \
  --tgt-options '{file_max_rows: 100000, format: json}'
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: MY_CLOUD_STORAGE

defaults:
  object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.json.gz'
  target_options:
    format: json
    compression: gzip

streams:
  # all tables in schema
  my_schema.*:
    object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.json'
    target_options:
      file_max_rows: 400000 # will split files into folder

  other_schema.source_table: # will use defaults
from sling import Replication, ReplicationStream, TargetOptions, Format, Compression
import os

# Set environment variables
os.environ['MY_SOURCE_DB'] = '...'
os.environ['MY_CLOUD_STORAGE'] = '...'

# Cloud storage JSON export with defaults
replication = Replication(
    source='MY_SOURCE_DB',
    target='MY_CLOUD_STORAGE',
    defaults={
        'object': '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.json.gz',
        'target_options': TargetOptions(
            format=Format.JSON,
            compression=Compression.GZIP
        )
    },
    streams={
        # all tables in schema
        'my_schema.*': ReplicationStream(
            object='{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.json',
            target_options=TargetOptions(file_max_rows=400000)
        ),
        'other_schema.source_table': {}  # will use defaults
    }
)

replication.run()
# Amazon S3
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_file.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_json_folder/*.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_json_folder/' \
  --tgt-options '{file_max_rows: 100000, format: jsonlines}'

# Google Cloud Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_file.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_json_folder/*.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_json_folder/' \
  --tgt-options '{file_max_rows: 100000, format: jsonlines}'

# Azure Blob Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_file.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_json_folder/*.jsonl'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_json_folder/' \
  --tgt-options '{file_max_rows: 100000, format: jsonlines}'
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: MY_CLOUD_STORAGE

defaults:
  object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.jsonl.gz'
  target_options:
    format: jsonlines
    compression: gzip

streams:
  # all tables in schema, except "forbidden_table"
  my_schema.*:
    object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.jsonl'
    target_options:
      file_max_rows: 400000 # will split files into folder

  my_schema.forbidden_table:
    disabled: true

  other_schema.source_table: # will use defaults

env:
  SLING_THREADS: 3 # run streams concurrently
from sling import Replication, ReplicationStream, TargetOptions, Format, Compression
import os

# Set environment variables
os.environ['MY_SOURCE_DB'] = '...'
os.environ['MY_CLOUD_STORAGE'] = '...'

# Cloud storage JSON Lines export with defaults
replication = Replication(
    source='MY_SOURCE_DB',
    target='MY_CLOUD_STORAGE',
    defaults={
        'object': '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.jsonl.gz',
        'target_options': TargetOptions(
            format=Format.JSONLINES,
            compression=Compression.GZIP
        )
    },
    streams={
        # all tables in schema, except "forbidden_table"
        'my_schema.*': ReplicationStream(
            object='{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.jsonl',
            target_options=TargetOptions(file_max_rows=400000)
        ),
        'my_schema.forbidden_table': ReplicationStream(
            disabled=True
        ),
        'other_schema.source_table': {}  # will use defaults
    },
    env={'SLING_THREADS': '3'}  # run streams concurrently
)

replication.run()
# Amazon S3
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_file.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_parquet_folder/*.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_S3_BUCKET --tgt-object 's3://my-bucket/my_parquet_folder/' \
  --tgt-options '{file_max_rows: 100000, format: parquet}'

# Google Cloud Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_file.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_parquet_folder/*.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_GS_BUCKET --tgt-object 'gs://my-bucket/my_parquet_folder/' \
  --tgt-options '{file_max_rows: 100000, format: parquet}'

# Azure Blob Storage
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_file.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_parquet_folder/*.parquet'
$ sling run --src-conn MY_SOURCE_DB --src-stream 'source_schema.source_table' \
  --tgt-conn MY_AZURE_CONT --tgt-object 'https://my_account.blob.core.windows.net/my-container/my_parquet_folder/' \
  --tgt-options '{file_max_rows: 100000, format: parquet}'
Running with Sling: sling run -r /path/to/replication.yaml
source: MY_SOURCE_DB
target: MY_CLOUD_STORAGE

defaults:
  object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.parquet'
  target_options:
    format: parquet

streams:
  # all tables in schema, except "forbidden_table"
  my_schema.*:
    object: '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.parquet'
    target_options:
      file_max_rows: 400000 # will split files into folder

  my_schema.forbidden_table:
    disabled: true

  other_schema.source_table: # will use defaults

env:
  SLING_THREADS: 3 # run streams concurrently
from sling import Replication, ReplicationStream, TargetOptions, Format
import os

# Set environment variables
os.environ['MY_SOURCE_DB'] = '...'
os.environ['MY_CLOUD_STORAGE'] = '...'

# Cloud storage Parquet export with defaults
replication = Replication(
    source='MY_SOURCE_DB',
    target='MY_CLOUD_STORAGE',
    defaults={
        'object': '{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}.parquet',
        'target_options': TargetOptions(format=Format.PARQUET)
    },
    streams={
        # all tables in schema, except "forbidden_table"
        'my_schema.*': ReplicationStream(
            object='{stream_schema}/{stream_table}/{YYYY}_{MM}_{DD}/*.parquet',
            target_options=TargetOptions(file_max_rows=400000)
        ),
        'my_schema.forbidden_table': ReplicationStream(
            disabled=True
        ),
        'other_schema.source_table': {}  # will use defaults
    },
    env={'SLING_THREADS': '3'}  # run streams concurrently
)

replication.run()