@mrjn This time I tried running the bulk loader (on a r4.8xlarge EC2 instance) as follows:
$ dgraph bulk --jsons=dgraph_mutations/ --reduce_shards=1 --map_shards=1 --schema_file=core_schema.txt --out=live --mapoutput_mb=1 --num_go_routines=4
Here is what my data directory looks like:
$ du -ch data/dgraph_mutations
14G data/dgraph_mutations/dgraph_mutations
14G data/dgraph_mutations
14G total
After some time, it crashed as follows:
MAP 17m33s nquad_count:278.9M err_count:0.000 nquad_speed:264.8k/sec edge_count:613.4M edge_speed:582.5k/sec
MAP 17m34s nquad_count:279.1M err_count:0.000 nquad_speed:264.8k/sec edge_count:613.9M edge_speed:582.4k/sec
fatal error: runtime: cannot allocate memory
fatal error: runtime: cannot allocate memory
fatal error: runtime: cannot allocate memory
fatal error: runtime: cannot allocate memory
runtime stack:
runtime.throw(0x146eac0, 0x1f)
/usr/local/go/src/runtime/panic.go:608 +0x72
runtime.newArenaMayUnlock(0x1f75220)
/usr/local/go/src/runtime/mheap.go:1668 +0xda
runtime.newMarkBits(0x33, 0x16)
/usr/local/go/src/runtime/mheap.go:1588 +0xb3
runtime.heapBits.initSpan(0x7fa8a45bd200, 0x20317600000000, 0x7fa8a477ffff, 0x7fa4ade9f690)
/usr/local/go/src/runtime/mbitmap.go:765 +0x74
runtime.(*mcentral).grow(0x1f7df38, 0x0)
/usr/local/go/src/runtime/mcentral.go:240 +0x12a
runtime.(*mcentral).cacheSpan(0x1f7df38, 0x7fa4ade9f4c8)
/usr/local/go/src/runtime/mcentral.go:106 +0x2f8
runtime.(*mcache).refill(0x7faa5efb3700, 0x7fa563534116) [361/1915]
/usr/local/go/src/runtime/mcache.go:122 +0x95
runtime.(*mcache).nextFree.func1()
/usr/local/go/src/runtime/malloc.go:749 +0x32
runtime.systemstack(0xca37468180)
/usr/local/go/src/runtime/asm_amd64.s:351 +0x66
runtime.mstart()
/usr/local/go/src/runtime/proc.go:1229
goroutine 381032 [running]:
runtime.systemstack_switch()
/usr/local/go/src/runtime/asm_amd64.s:311 fp=0xe164216b80 sp=0xe164216b78 pc=0x8d6450
runtime.(*mcache).nextFree(0x7faa5efb3700, 0xc5d879e416, 0x0, 0xca37462000, 0x1)
/usr/local/go/src/runtime/malloc.go:748 +0xb6 fp=0xe164216bd8 sp=0xe164216b80 pc=0x886c96
runtime.mallocgc(0xa0, 0x1416680, 0x1, 0xc5d879e420)
/usr/local/go/src/runtime/malloc.go:903 +0x793 fp=0xe164216c78 sp=0xe164216bd8 pc=0x8875e3
runtime.newobject(0x1416680, 0xf5e82c3979)
/usr/local/go/src/runtime/malloc.go:1032 +0x38 fp=0xe164216ca8 sp=0xe164216c78 pc=0x8879c8
github.com/dgraph-io/dgraph/protos/pb.(*MapEntry).Unmarshal(0xc5d87993b0, 0xf5e82c3977, 0x2e, 0x28770e689, 0x1, 0x7fa563
534000)
/ext-go/1/src/github.com/dgraph-io/dgraph/protos/pb/pb.pb.go:14783 +0x655 fp=0xe164216da0 sp=0xe164216ca8 pc=0xd
3d2b5
github.com/dgraph-io/dgraph/protos/pb.(*MapEntry).XXX_Unmarshal(0xc5d87993b0, 0xf5e82c3977, 0x2e, 0x28770e689, 0xc5d8799
3b0, 0xe164216e01)
/ext-go/1/src/github.com/dgraph-io/dgraph/protos/pb/pb.pb.go:2835 +0x49 fp=0xe164216de0 sp=0xe164216da0 pc=0xcfb
259
github.com/dgraph-io/dgraph/vendor/github.com/gogo/protobuf/proto.Unmarshal(0xf5e82c3977, 0x2e, 0x28770e689, 0x1587ba0,
0xc5d87993b0, 0x0, 0x0)
/ext-go/1/src/github.com/dgraph-io/dgraph/vendor/github.com/gogo/protobuf/proto/decode.go:338 +0x1c2 fp=0xe16421
6e48 sp=0xe164216de0 pc=0xfea322
github.com/dgraph-io/dgraph/dgraph/cmd/bulk.(*mapper).writeMapEntriesToFile(0xc0001b4940, 0xf50f9d2000, 0x220065a30, 0x3
60000000, 0x0)
/ext-go/1/src/github.com/dgraph-io/dgraph/dgraph/cmd/bulk/mapper.go:89 +0x1b7 fp=0xe164216fb8 sp=0xe164216e48 pc
=0x1170d77
runtime.goexit()
/usr/local/go/src/runtime/asm_amd64.s:1333 +0x1 fp=0xe164216fc0 sp=0xe164216fb8 pc=0x8d8531 [326/1915]
created by github.com/dgraph-io/dgraph/dgraph/cmd/bulk.(*mapper).run
/ext-go/1/src/github.com/dgraph-io/dgraph/dgraph/cmd/bulk/mapper.go:150 +0x258
goroutine 1 [semacquire, 14 minutes]:
sync.runtime_Semacquire(0xc000034238)
/usr/local/go/src/runtime/sema.go:56 +0x39
sync.(*WaitGroup).Wait(0xc000034230)
/usr/local/go/src/sync/waitgroup.go:130 +0x64
github.com/dgraph-io/dgraph/dgraph/cmd/bulk.(*loader).mapStage(0xc0004cb2c0)
/ext-go/1/src/github.com/dgraph-io/dgraph/dgraph/cmd/bulk/loader.go:255 +0xa90
github.com/dgraph-io/dgraph/dgraph/cmd/bulk.run()
/ext-go/1/src/github.com/dgraph-io/dgraph/dgraph/cmd/bulk/run.go:183 +0xb3f
github.com/dgraph-io/dgraph/dgraph/cmd/bulk.init.0.func1(0xc000103200, 0xc0001b8fc0, 0x0, 0x7)
/ext-go/1/src/github.com/dgraph-io/dgraph/dgraph/cmd/bulk/run.go:44 +0x52
github.com/dgraph-io/dgraph/vendor/github.com/spf13/cobra.(*Command).execute(0xc000103200, 0xc0001b8e00, 0x7, 0x7, 0xc00
0103200, 0xc0001b8e00)
/ext-go/1/src/github.com/dgraph-io/dgraph/vendor/github.com/spf13/cobra/command.go:702 +0x2d3
github.com/dgraph-io/dgraph/vendor/github.com/spf13/cobra.(*Command).ExecuteC(0x1ecd680, 0x8, 0x0, 0x0)
/ext-go/1/src/github.com/dgraph-io/dgraph/vendor/github.com/spf13/cobra/command.go:783 +0x2dc
github.com/dgraph-io/dgraph/vendor/github.com/spf13/cobra.(*Command).Execute(0x1ecd680, 0x8ac98d, 0x1ed5868)
/ext-go/1/src/github.com/dgraph-io/dgraph/vendor/github.com/spf13/cobra/command.go:736 +0x2b
github.com/dgraph-io/dgraph/dgraph/cmd.Execute()
/ext-go/1/src/github.com/dgraph-io/dgraph/dgraph/cmd/root.go:58 +0x36
main.main()
/ext-go/1/src/github.com/dgraph-io/dgraph/dgraph/main.go:33 +0x88
goroutine 5 [chan receive]:
github.com/dgraph-io/dgraph/vendor/github.com/golang/glog.(*loggingT).flushDaemon(0x1f763c0)
/ext-go/1/src/github.com/dgraph-io/dgraph/vendor/github.com/golang/glog/glog.go:882 +0x8b
created by github.com/dgraph-io/dgraph/vendor/github.com/golang/glog.init.0
/ext-go/1/src/github.com/dgraph-io/dgraph/vendor/github.com/golang/glog/glog.go:410 +0x203
goroutine 19 [syscall, 17 minutes]:
os/signal.signal_recv(0x14a6a38)
/usr/local/go/src/runtime/sigqueue.go:139 +0x9c
os/signal.loop()
/usr/local/go/src/os/signal/signal_unix.go:23 +0x22
created by os/signal.init.0
/usr/local/go/src/os/signal/signal_unix.go:29 +0x41
goroutine 20 [chan receive]:
github.com/dgraph-io/dgraph/x.init.1.func1()
/ext-go/1/src/github.com/dgraph-io/dgraph/x/metrics.go:95 +0x7e
created by github.com/dgraph-io/dgraph/x.init.1
/ext-go/1/src/github.com/dgraph-io/dgraph/x/metrics.go:90 +0x52c
goroutine 25 [select]:
go.opencensus.io/stats/view.(*worker).start(0xc0001ecac0)
/go/src/go.opencensus.io/stats/view/worker.go:152 +0xdd
created by go.opencensus.io/stats/view.init.0
/go/src/go.opencensus.io/stats/view/worker.go:29 +0x57
goroutine 26 [chan receive]:
github.com/dgraph-io/dgraph/worker.(*rateLimiter).bleed(0x1f93f0c)
/ext-go/1/src/github.com/dgraph-io/dgraph/worker/proposal.go:61 +0x95
created by github.com/dgraph-io/dgraph/worker.init.0
/ext-go/1/src/github.com/dgraph-io/dgraph/worker/proposal.go:46 +0x41
...
I am not pasting the whole stack trace above, but hopefully it is enough to give you an idea. Also, I took a heap profile (a few minutes before the crash), and here is what it looks like:
$ go tool pprof pprof.dgraph.alloc_objects.alloc_space.inuse_objects.inuse_space.004.pb
(pprof) top
Showing nodes accounting for 119.39GB, 98.64% of 121.04GB total
Dropped 104 nodes (cum <= 0.61GB)
Showing top 10 nodes out of 36
flat flat% sum% cum cum%
34.66GB 28.64% 28.64% 42.05GB 34.74% github.com/dgraph-io/dgraph/edgraph.mapToNquads
20.25GB 16.73% 45.37% 20.25GB 16.73% github.com/dgraph-io/dgraph/x.ReserveCap
17.14GB 14.16% 59.52% 18.06GB 14.92% github.com/dgraph-io/dgraph/protos/pb.(*MapEntry).Unmarshal
14GB 11.57% 71.09% 14GB 11.57% bytes.makeSlice
11.43GB 9.44% 80.53% 29.49GB 24.36% github.com/dgraph-io/dgraph/dgraph/cmd/bulk.(*mapper).writeMapEntriesToF
ile
7.39GB 6.10% 86.64% 7.39GB 6.10% github.com/dgraph-io/dgraph/edgraph.handleBasicType
6.07GB 5.02% 91.65% 12.51GB 10.33% encoding/json.(*decodeState).objectInterface
5.82GB 4.81% 96.46% 5.82GB 4.81% encoding/json.unquote
1.71GB 1.41% 97.87% 56.57GB 46.74% github.com/dgraph-io/dgraph/dgraph/cmd/bulk.jsonChunker.parse
0.92GB 0.76% 98.64% 0.92GB 0.76% github.com/dgraph-io/dgraph/protos/pb.(*Posting).Unmarshal
The portal did not allow me to upload the pb file. Let me know if you need any more info.