Has anyone run the benchmark in a multi-node setup (CPU only, without GPUs)?
The detailed information:
CentOS 7.2
git checkout r1.3
bazel build --config=mkl --copt=-DEIGEN_USE_VML -s -c opt //tensorflow/tools/pip_package:build_pip_package
The commands used are as follows:
python tf_cnn_benchmarks.py --local_parameter_device=cpu --batch_size=32 --model=alexnet --variable_update=distributed_replicated --job_name=worker --ps_hosts=192.192.1.1:50000 --worker_hosts=192.192.1.1:50001 --task_index=0
python tf_cnn_benchmarks.py --local_parameter_device=cpu --batch_size=32 --model=alexnet --variable_update=distributed_replicated --job_name=ps --ps_hosts=192.192.1.1:50000 --worker_hosts=192.192.1.1:50001 --task_index=0
-- PS message --
Running parameter server 0
-- Worker error message --
Running warm up
Exception in thread Thread-1:
Traceback (most recent call last):
File "/usr/lib64/python2.7/threading.py", line 811, in __bootstrap_inner
self.run()
File "tf_cnn_benchmarks.py", line 232, in run
global_step_val, = self.sess.run([self.global_step_op])
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 895, in run
run_metadata_ptr)
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1051, in _run
raise RuntimeError('Attempted to use a closed Session.')
RuntimeError: Attempted to use a closed Session.
Traceback (most recent call last):
File "tf_cnn_benchmarks.py", line 1345, in <module>
tf.app.run()
File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "tf_cnn_benchmarks.py", line 1341, in main
bench.run()
File "tf_cnn_benchmarks.py", line 884, in run
self._benchmark_cnn()
File "tf_cnn_benchmarks.py", line 1026, in _benchmark_cnn
self.trace_filename, fetch_summary)
File "tf_cnn_benchmarks.py", line 660, in benchmark_one_step
results = sess.run(fetches, options=run_options, run_metadata=run_metadata)
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 895, in run
run_metadata_ptr)
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1124, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1321, in _do_run
options, run_metadata)
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1340, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: tensor_in must be 4-dimensional
[[Node: v0/tower_0/gradients/v0/tower_0/mpool2/MaxPool_grad/MaxPoolGrad = _MklMaxPoolGrad[T=DT_FLOAT, _kernel="MklOp", data_format="NCHW", ksize=[1, 1, 3, 3], padding="VALID", strides=[1, 1, 2, 2], workspace_enabled=true, _device="/job:worker/replica:0/task:0/cpu:0"](v0/tower_0/conv4/Relu, v0/tower_0/mpool2/MaxPool, v0/tower_0/gradients/v0/tower_0/Reshape_grad/Reshape, v0/tower_0/mpool2/MaxPool:1, DMT/_57, DMT/_58, v0/tower_0/gradients/v0/tower_0/Reshape_grad/Reshape:1, v0/tower_0/mpool2/MaxPool:3)]]
Caused by op u'v0/tower_0/gradients/v0/tower_0/mpool2/MaxPool_grad/MaxPoolGrad', defined at:
File "tf_cnn_benchmarks.py", line 1345, in <module>
tf.app.run()
File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "tf_cnn_benchmarks.py", line 1341, in main
bench.run()
File "tf_cnn_benchmarks.py", line 884, in run
self._benchmark_cnn()
File "tf_cnn_benchmarks.py", line 924, in _benchmark_cnn
(enqueue_ops, fetches) = self._build_model()
File "tf_cnn_benchmarks.py", line 1095, in _build_model
gpu_grad_stage_ops)
File "tf_cnn_benchmarks.py", line 1262, in add_forward_pass_and_gradients
grads = tf.gradients(loss, params, aggregation_method=aggmeth)
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.py", line 542, in gradients
grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.py", line 348, in _MaybeCompile
return grad_fn() # Exit early
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.py", line 542, in <lambda>
grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/nn_grad.py", line 526, in _MaxPoolGrad
data_format=op.get_attr("data_format"))
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 1754, in _max_pool_grad
data_format=data_format, name=name)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2628, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
...which was originally created as op u'v0/tower_0/mpool2/MaxPool', defined at:
File "tf_cnn_benchmarks.py", line 1345, in <module>
tf.app.run()
[elided 4 identical lines from previous traceback]
File "tf_cnn_benchmarks.py", line 1095, in _build_model
gpu_grad_stage_ops)
File "tf_cnn_benchmarks.py", line 1245, in add_forward_pass_and_gradients
self.model_conf.add_inference(network)
File "/home/tina/tensorflow/benchmarks/scripts/tf_cnn_benchmarks/alexnet_model.py", line 42, in add_inference
cnn.mpool(3, 3, 2, 2)
File "tf_cnn_benchmarks.py", line 372, in mpool
name=name)
File "/usr/lib/python2.7/site-packages/tensorflow/python/layers/pooling.py", line 426, in max_pooling2d
return layer.apply(inputs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/layers/base.py", line 503, in apply
return self.__call__(inputs, *args, **kwargs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/layers/base.py", line 450, in __call__
outputs = self.call(inputs, *args, **kwargs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/layers/pooling.py", line 276, in call
data_format=utils.convert_data_format(self.data_format, 4))
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/nn_ops.py", line 1772, in max_pool
name=name)
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 1607, in _max_pool
data_format=data_format, name=name)
InvalidArgumentError (see above for traceback): tensor_in must be 4-dimensional
[[Node: v0/tower_0/gradients/v0/tower_0/mpool2/MaxPool_grad/MaxPoolGrad = _MklMaxPoolGrad[T=DT_FLOAT, _kernel="MklOp", data_format="NCHW", ksize=[1, 1, 3, 3], padding="VALID", strides=[1, 1, 2, 2], workspace_enabled=true, _device="/job:worker/replica:0/task:0/cpu:0"](v0/tower_0/conv4/Relu, v0/tower_0/mpool2/MaxPool, v0/tower_0/gradients/v0/tower_0/Reshape_grad/Reshape, v0/tower_0/mpool2/MaxPool:1, DMT/_57, DMT/_58, v0/tower_0/gradients/v0/tower_0/Reshape_grad/Reshape:1, v0/tower_0/mpool2/MaxPool:3)]]