问题
I'm trying to fine-tune a Hugging Face Transformers BERT model on a TPU. It works in Colab but fails when I switch to a paid TPU on GCP. The Jupyter notebook code is as follows:
[1] model = transformers.TFBertModel.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
# works
[2] cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
tpu='[My TPU]',
zone='us-central1-a',
project='[My Project]'
)
tf.config.experimental_connect_to_cluster(cluster_resolver)
tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
#Also works. Got a bunch of startup messages from the TPU - all good.
[3] with tpu_strategy.scope():
model = TFBertModel.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
#Generates the error below (long). Same line works in Colab.
Here's the error message:
NotFoundError Traceback (most recent call last)
<ipython-input-14-2cfc1a238903> in <module>
1 with tpu_strategy.scope():
----> 2 model = TFBertModel.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
~/.local/lib/python3.5/site-packages/transformers/modeling_tf_utils.py in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
309 return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True)
310
--> 311 ret = model(model.dummy_inputs, training=False) # build the network with dummy inputs
312
313 assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file)
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
820 with base_layer_utils.autocast_context_manager(
821 self._compute_dtype):
--> 822 outputs = self.call(cast_inputs, *args, **kwargs)
823 self._handle_activity_regularization(inputs, outputs)
824 self._set_mask_metadata(inputs, outputs, input_masks)
~/.local/lib/python3.5/site-packages/transformers/modeling_tf_bert.py in call(self, inputs, **kwargs)
688
689 def call(self, inputs, **kwargs):
--> 690 outputs = self.bert(inputs, **kwargs)
691 return outputs
692
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
820 with base_layer_utils.autocast_context_manager(
821 self._compute_dtype):
--> 822 outputs = self.call(cast_inputs, *args, **kwargs)
823 self._handle_activity_regularization(inputs, outputs)
824 self._set_mask_metadata(inputs, outputs, input_masks)
~/.local/lib/python3.5/site-packages/transformers/modeling_tf_bert.py in call(self, inputs, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training)
548
549 embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
--> 550 encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
551
552 sequence_output = encoder_outputs[0]
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
820 with base_layer_utils.autocast_context_manager(
821 self._compute_dtype):
--> 822 outputs = self.call(cast_inputs, *args, **kwargs)
823 self._handle_activity_regularization(inputs, outputs)
824 self._set_mask_metadata(inputs, outputs, input_masks)
~/.local/lib/python3.5/site-packages/transformers/modeling_tf_bert.py in call(self, inputs, training)
365 all_hidden_states = all_hidden_states + (hidden_states,)
366
--> 367 layer_outputs = layer_module([hidden_states, attention_mask, head_mask[i]], training=training)
368 hidden_states = layer_outputs[0]
369
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
820 with base_layer_utils.autocast_context_manager(
821 self._compute_dtype):
--> 822 outputs = self.call(cast_inputs, *args, **kwargs)
823 self._handle_activity_regularization(inputs, outputs)
824 self._set_mask_metadata(inputs, outputs, input_masks)
~/.local/lib/python3.5/site-packages/transformers/modeling_tf_bert.py in call(self, inputs, training)
341 hidden_states, attention_mask, head_mask = inputs
342
--> 343 attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training)
344 attention_output = attention_outputs[0]
345 intermediate_output = self.intermediate(attention_output)
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
820 with base_layer_utils.autocast_context_manager(
821 self._compute_dtype):
--> 822 outputs = self.call(cast_inputs, *args, **kwargs)
823 self._handle_activity_regularization(inputs, outputs)
824 self._set_mask_metadata(inputs, outputs, input_masks)
~/.local/lib/python3.5/site-packages/transformers/modeling_tf_bert.py in call(self, inputs, training)
290 input_tensor, attention_mask, head_mask = inputs
291
--> 292 self_outputs = self.self_attention([input_tensor, attention_mask, head_mask], training=training)
293 attention_output = self.dense_output([self_outputs[0], input_tensor], training=training)
294 outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
820 with base_layer_utils.autocast_context_manager(
821 self._compute_dtype):
--> 822 outputs = self.call(cast_inputs, *args, **kwargs)
823 self._handle_activity_regularization(inputs, outputs)
824 self._set_mask_metadata(inputs, outputs, input_masks)
~/.local/lib/python3.5/site-packages/transformers/modeling_tf_bert.py in call(self, inputs, training)
222
223 batch_size = shape_list(hidden_states)[0]
--> 224 mixed_query_layer = self.query(hidden_states)
225 mixed_key_layer = self.key(hidden_states)
226 mixed_value_layer = self.value(hidden_states)
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
820 with base_layer_utils.autocast_context_manager(
821 self._compute_dtype):
--> 822 outputs = self.call(cast_inputs, *args, **kwargs)
823 self._handle_activity_regularization(inputs, outputs)
824 self._set_mask_metadata(inputs, outputs, input_masks)
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/layers/core.py in call(self, inputs)
1142 outputs = gen_math_ops.mat_mul(inputs, self.kernel)
1143 if self.use_bias:
-> 1144 outputs = nn.bias_add(outputs, self.bias)
1145 if self.activation is not None:
1146 return self.activation(outputs) # pylint: disable=not-callable
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/ops/nn_ops.py in bias_add(value, bias, data_format, name)
2756 else:
2757 return gen_nn_ops.bias_add(
-> 2758 value, bias, data_format=data_format, name=name)
2759
2760
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/ops/gen_nn_ops.py in bias_add(value, bias, data_format, name)
675 try:
676 return bias_add_eager_fallback(
--> 677 value, bias, data_format=data_format, name=name, ctx=_ctx)
678 except _core._SymbolicException:
679 pass # Add nodes to the TensorFlow graph.
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/ops/gen_nn_ops.py in bias_add_eager_fallback(value, bias, data_format, name, ctx)
703 data_format = "NHWC"
704 data_format = _execute.make_str(data_format, "data_format")
--> 705 _attr_T, _inputs_T = _execute.args_to_matching_eager([value, bias], ctx)
706 (value, bias) = _inputs_T
707 _inputs_flat = [value, bias]
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/eager/execute.py in args_to_matching_eager(l, ctx, default_dtype)
265 dtype = ret[-1].dtype
266 else:
--> 267 ret = [ops.convert_to_tensor(t, dtype, ctx=ctx) for t in l]
268
269 # TODO(slebedev): consider removing this as it leaks a Keras concept.
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/eager/execute.py in <listcomp>(.0)
265 dtype = ret[-1].dtype
266 else:
--> 267 ret = [ops.convert_to_tensor(t, dtype, ctx=ctx) for t in l]
268
269 # TODO(slebedev): consider removing this as it leaks a Keras concept.
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/framework/ops.py in convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, dtype_hint, ctx, accepted_result_types)
1312
1313 if ret is None:
-> 1314 ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
1315
1316 if ret is NotImplemented:
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/distribute/values.py in _tensor_conversion_mirrored(var, dtype, name, as_ref)
1174 # allowing instances of the class to be used as tensors.
1175 def _tensor_conversion_mirrored(var, dtype=None, name=None, as_ref=False):
-> 1176 return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref) # pylint: disable=protected-access
1177
1178
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/distribute/values.py in _dense_var_to_tensor(self, dtype, name, as_ref)
908 if _enclosing_tpu_context() is None:
909 return super(TPUVariableMixin, self)._dense_var_to_tensor(
--> 910 dtype=dtype, name=name, as_ref=as_ref)
911 # pylint: enable=protected-access
912 elif dtype is not None and dtype != self.dtype:
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/distribute/values.py in _dense_var_to_tensor(self, dtype, name, as_ref)
1164 assert not as_ref
1165 return ops.convert_to_tensor(
-> 1166 self.get(), dtype=dtype, name=name, as_ref=as_ref)
1167
1168 def _clone_with_new_values(self, new_values):
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/distribute/values.py in get(self, device)
835 def get(self, device=None):
836 if (_enclosing_tpu_context() is None) or (device is not None):
--> 837 return super(TPUVariableMixin, self).get(device=device)
838 else:
839 raise NotImplementedError(
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/distribute/values.py in get(self, device)
320 device = distribute_lib.get_update_device()
321 if device is None:
--> 322 return self._get_cross_replica()
323 device = device_util.canonicalize(device)
324 return self._device_map.select_for_device(self._values, device)
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/distribute/values.py in _get_cross_replica(self)
1136 replica_id = self._device_map.replica_for_device(device)
1137 if replica_id is None:
-> 1138 return array_ops.identity(self.primary)
1139 return array_ops.identity(self._values[replica_id])
1140
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/util/dispatch.py in wrapper(*args, **kwargs)
178 """Call target, and fall back on dispatchers if there is a TypeError."""
179 try:
--> 180 return target(*args, **kwargs)
181 except (TypeError, ValueError):
182 # Note: convert_to_eager_tensor currently raises a ValueError, not a
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/ops/array_ops.py in identity(input, name)
265 # variables. Variables have correct handle data when graph building.
266 input = ops.convert_to_tensor(input)
--> 267 ret = gen_array_ops.identity(input, name=name)
268 # Propagate handle data for happier shape inference for resource variables.
269 if hasattr(input, "_handle_data"):
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/ops/gen_array_ops.py in identity(input, name)
3824 pass # Add nodes to the TensorFlow graph.
3825 except _core._NotOkStatusException as e:
-> 3826 _ops.raise_from_not_ok_status(e, name)
3827 # Add nodes to the TensorFlow graph.
3828 _, _, _op, _outputs = _op_def_library._apply_op_helper(
/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/framework/ops.py in raise_from_not_ok_status(e, name)
6604 message = e.message + (" name: " + name if name is not None else "")
6605 # pylint: disable=protected-access
-> 6606 six.raise_from(core._status_to_exception(e.code, message), None)
6607 # pylint: enable=protected-access
6608
/usr/local/lib/python3.5/dist-packages/six.py in raise_from(value, from_value)
NotFoundError: '_MklMatMul' is neither a type of a primitive operation nor a name of a function registered in binary running on n-aa2fcfb7-w-0. One possible root cause is the client and server binaries are not built with the same version. Please make sure the operation or function is registered in the binary running in this process. [Op:Identity]
I posted this on the Hugging Face GitHub (https://github.com/huggingface/transformers/issues/2572), and they suggested that the TPU server version may not match the TPU client version, but (a) I don't know how to check for that, and (b) I don't know what to do about it if they differ. Suggestions appreciated.
来源:https://stackoverflow.com/questions/59851553/huggingface-bert-tpu-fine-tuning-works-on-colab-but-not-in-gcp