Here is example of data model for load info about files and their references to sites from where those files are downloaded.
We try to optimize load speed and play with number of client connections, buffer size and batch size.
What will be optimal parameters for this use case.
Nebula cluster is 4 nodes with 64 cores, 256GB RAM, 4 x 3.5TB Nvme disks
Nebula database version is 1.2.0
Here are also some script definitions and some simple template trick to generate scripts for parallel or sequential load for large dump files to be able to control velocity and be able to continue with load when something breaks.
NebulaDB Data Model
# Schema for the file/domain reference graph.
# Every statement uses IF NOT EXISTS so the script can be re-run safely
# (the original guarded only CREATE SPACE; the remaining statements would
# fail on a second run).
CREATE SPACE IF NOT EXISTS fgraph(partition_num=128, replica_factor=3);
# NOTE(review): a newly created space may not be usable immediately; if USE
# fails right after CREATE SPACE, wait a few seconds and retry - confirm for 1.2.0.
USE fgraph;

# Vertex tags: `info` and `file` are attached to the same file vertex,
# `domain` describes a download site.
CREATE TAG IF NOT EXISTS info(sha1 string, record_timestamp int);
CREATE TAG IF NOT EXISTS file(file_name string, file_size int);
CREATE TAG IF NOT EXISTS domain(source_domain string);

# Property-less edge type used for the reference relation.
CREATE EDGE IF NOT EXISTS reference();

# NOTE(review): indexes that exist during a bulk import presumably have to be
# maintained on every insert; if load speed matters, consider creating them
# after the import and rebuilding them (REBUILD TAG INDEX) - verify for 1.2.0.
CREATE TAG INDEX IF NOT EXISTS source_domain_idx ON domain(source_domain);
CREATE TAG INDEX IF NOT EXISTS info_timestamp_idx ON info(record_timestamp);
files-template.yaml
# nebula-importer config template for loading file vertices.
# Each CSV row becomes one vertex carrying both the `info` and `file` tags.
# The literal XY in the three paths below is a placeholder; the generator loop
# substitutes a two-digit hex shard id for it.
version: v1
description: Files info and size - Template
removeTempFiles: false
clientSettings:
  retry: 5
  concurrency: 16
  channelBufferSize: 1024
  space: fgraph
  connection:
    user: root
    password: nebula
    address: alt-nebuladb01:9669,alt-nebuladb02:9669,alt-nebuladb03:9669,alt-nebuladb04:9669
  # NOTE(review): the flattened original had `afterPeriod` directly after
  # `address`; the importer's v1 layout appears to nest it under `postStart` -
  # confirm against the importer version in use.
  postStart:
    afterPeriod: 3s
logPath: /data01/dumps/err/files-XY.log
files:
  - path: /data01/dumps/files/files-XY.csv
    failDataPath: /data01/dumps/err/files-XY.csv
    batchSize: 32
    inOrder: false
    type: csv
    csv:
      withHeader: false
      withLabel: false
      delimiter: ","
    schema:
      type: vertex
      vertex:
        # CSV column 0 is the vertex id.
        vid:
          index: 0
        tags:
          # Columns 1-2 populate the `info` tag.
          - name: info
            props:
              - name: sha1
                type: string
                index: 1
              - name: record_timestamp
                type: int
                index: 2
          # Columns 3-4 populate the `file` tag.
          - name: file
            props:
              - name: file_name
                type: string
                index: 3
              - name: file_size
                type: int
                index: 4
# Generate one importer config per shard (files-00.yaml .. files-ff.yaml)
# from files-template.yaml by substituting the XY placeholder.
for id in $(seq 0 255); do
  shard=$(printf '%02x' "$id")
  # Match "files-XY." rather than a bare "XY" so that only the placeholder in
  # the log/data/fail paths is rewritten, never unrelated text in the template.
  sed "s/files-XY\./files-${shard}./g" /data01/dumps/scripts/files-template.yaml \
    > "/data01/dumps/scripts/files-${shard}.yaml"
done
# Load the file shards sequentially, pausing 10 s between shards.
# Set START_ID (decimal, 0-255) to resume an interrupted run at a given shard;
# it defaults to 0, i.e. the full run.
for id in $(seq "${START_ID:-0}" 255); do
  shard=$(printf '%02x' "$id")
  # Failures are reported but do not abort the run, so the remaining shards
  # still load. Assumes the importer exits non-zero on failure - TODO confirm.
  if ! /home/nebula/nebula-importer/nebula-importer --config "/data01/dumps/scripts/files-${shard}.yaml"; then
    echo "nebula-importer failed for files-${shard}.yaml (id=${id})" >&2
  fi
  sleep 10
done
domains.yaml
# nebula-importer config for loading domain vertices (tag `domain`)
# from a single CSV file.
version: v1
description: Domains
removeTempFiles: false
clientSettings:
  retry: 5
  concurrency: 16
  channelBufferSize: 1024
  space: fgraph
  connection:
    user: root
    password: nebula
    address: alt-nebuladb01:9669,alt-nebuladb02:9669,alt-nebuladb03:9669,alt-nebuladb04:9669
  # NOTE(review): the flattened original had `afterPeriod` directly after
  # `address`; the importer's v1 layout appears to nest it under `postStart` -
  # confirm against the importer version in use.
  postStart:
    afterPeriod: 3s
logPath: /data01/dumps/err/domains.log
files:
  - path: /data01/dumps/domains/domains.csv
    failDataPath: /data01/dumps/err/domains.csv
    batchSize: 16
    inOrder: false
    type: csv
    csv:
      withHeader: false
      withLabel: false
      delimiter: ","
    schema:
      type: vertex
      vertex:
        # CSV column 0 is the vertex id.
        vid:
          index: 0
        tags:
          # Column 1 populates the `domain` tag.
          - name: domain
            props:
              - name: source_domain
                type: string
                index: 1
# Load the domain vertices (single file, single importer run).
/home/nebula/nebula-importer/nebula-importer --config /data01/dumps/scripts/domains.yaml
references-template.yaml
# nebula-importer config template for loading `reference` edges.
# The single capital letter after "references-" in the three paths below is a
# placeholder; the generator loop substitutes a one-digit hex shard id for it.
version: v1
description: References - Template
removeTempFiles: false
clientSettings:
  retry: 5
  concurrency: 16
  channelBufferSize: 1024
  space: fgraph
  connection:
    user: root
    password: nebula
    address: alt-nebuladb01:9669,alt-nebuladb02:9669,alt-nebuladb03:9669,alt-nebuladb04:9669
  # NOTE(review): the flattened original had `afterPeriod` directly after
  # `address`; the importer's v1 layout appears to nest it under `postStart` -
  # confirm against the importer version in use.
  postStart:
    afterPeriod: 3s
logPath: /data01/dumps/err/references-X.log
files:
  - path: /data01/dumps/domains/references-X.csv
    failDataPath: /data01/dumps/err/references-X.csv
    batchSize: 32
    inOrder: false
    type: csv
    csv:
      withHeader: false
      withLabel: false
      delimiter: ","
    schema:
      type: edge
      edge:
        name: reference
        withRanking: false
        # CSV column 0 is the source vertex id, column 1 the destination.
        srcVID:
          index: 0
        dstVID:
          index: 1
# Generate one importer config per shard (references-0.yaml .. references-f.yaml)
# from references-template.yaml by substituting the placeholder letter.
for id in $(seq 0 15); do
  shard=$(printf '%x' "$id")
  # Match "references-X." rather than a bare "X": a single capital letter is a
  # fragile pattern that would also rewrite any other X appearing on a line.
  sed "s/references-X\./references-${shard}./g" /data01/dumps/scripts/references-template.yaml \
    > "/data01/dumps/scripts/references-${shard}.yaml"
done
# Load the reference-edge shards sequentially, pausing 10 s between shards.
# Set START_ID (decimal, 0-15) to resume an interrupted run at a given shard;
# it defaults to 0, i.e. the full run.
for id in $(seq "${START_ID:-0}" 15); do
  shard=$(printf '%x' "$id")
  # Failures are reported but do not abort the run, so the remaining shards
  # still load. Assumes the importer exits non-zero on failure - TODO confirm.
  if ! /home/nebula/nebula-importer/nebula-importer --config "/data01/dumps/scripts/references-${shard}.yaml"; then
    echo "nebula-importer failed for references-${shard}.yaml (id=${id})" >&2
  fi
  sleep 10
done