Huggingface Bert TPU fine-tuning works on Colab but not in GCP

问题

I'm trying to fine-tune a Hugging Face Transformers BERT model on a TPU. It works in Colab but fails when I switch to a paid TPU on GCP. The Jupyter notebook code is as follows:

[1] model = transformers.TFBertModel.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
# works
[2] cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu='[My TPU]',
    zone='us-central1-a',
    project='[My Project]'
)
tf.config.experimental_connect_to_cluster(cluster_resolver)
tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
#Also works. Got a bunch of startup messages from the TPU - all good.

[3] with tpu_strategy.scope():
    model = TFBertModel.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
#Generates the error below (long). Same line works in Colab.

Here's the error message:

NotFoundError                             Traceback (most recent call last)
<ipython-input-14-2cfc1a238903> in <module>
      1 with tpu_strategy.scope():
----> 2     model = TFBertModel.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

~/.local/lib/python3.5/site-packages/transformers/modeling_tf_utils.py in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
    309             return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True)
    310 
--> 311         ret = model(model.dummy_inputs, training=False)  # build the network with dummy inputs
    312 
    313         assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file)

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
    820           with base_layer_utils.autocast_context_manager(
    821               self._compute_dtype):
--> 822             outputs = self.call(cast_inputs, *args, **kwargs)
    823           self._handle_activity_regularization(inputs, outputs)
    824           self._set_mask_metadata(inputs, outputs, input_masks)

~/.local/lib/python3.5/site-packages/transformers/modeling_tf_bert.py in call(self, inputs, **kwargs)
    688 
    689     def call(self, inputs, **kwargs):
--> 690         outputs = self.bert(inputs, **kwargs)
    691         return outputs
    692 

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
    820           with base_layer_utils.autocast_context_manager(
    821               self._compute_dtype):
--> 822             outputs = self.call(cast_inputs, *args, **kwargs)
    823           self._handle_activity_regularization(inputs, outputs)
    824           self._set_mask_metadata(inputs, outputs, input_masks)

~/.local/lib/python3.5/site-packages/transformers/modeling_tf_bert.py in call(self, inputs, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training)
    548 
    549         embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
--> 550         encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
    551 
    552         sequence_output = encoder_outputs[0]

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
    820           with base_layer_utils.autocast_context_manager(
    821               self._compute_dtype):
--> 822             outputs = self.call(cast_inputs, *args, **kwargs)
    823           self._handle_activity_regularization(inputs, outputs)
    824           self._set_mask_metadata(inputs, outputs, input_masks)

~/.local/lib/python3.5/site-packages/transformers/modeling_tf_bert.py in call(self, inputs, training)
    365                 all_hidden_states = all_hidden_states + (hidden_states,)
    366 
--> 367             layer_outputs = layer_module([hidden_states, attention_mask, head_mask[i]], training=training)
    368             hidden_states = layer_outputs[0]
    369 

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
    820           with base_layer_utils.autocast_context_manager(
    821               self._compute_dtype):
--> 822             outputs = self.call(cast_inputs, *args, **kwargs)
    823           self._handle_activity_regularization(inputs, outputs)
    824           self._set_mask_metadata(inputs, outputs, input_masks)

~/.local/lib/python3.5/site-packages/transformers/modeling_tf_bert.py in call(self, inputs, training)
    341         hidden_states, attention_mask, head_mask = inputs
    342 
--> 343         attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training)
    344         attention_output = attention_outputs[0]
    345         intermediate_output = self.intermediate(attention_output)

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
    820           with base_layer_utils.autocast_context_manager(
    821               self._compute_dtype):
--> 822             outputs = self.call(cast_inputs, *args, **kwargs)
    823           self._handle_activity_regularization(inputs, outputs)
    824           self._set_mask_metadata(inputs, outputs, input_masks)

~/.local/lib/python3.5/site-packages/transformers/modeling_tf_bert.py in call(self, inputs, training)
    290         input_tensor, attention_mask, head_mask = inputs
    291 
--> 292         self_outputs = self.self_attention([input_tensor, attention_mask, head_mask], training=training)
    293         attention_output = self.dense_output([self_outputs[0], input_tensor], training=training)
    294         outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
    820           with base_layer_utils.autocast_context_manager(
    821               self._compute_dtype):
--> 822             outputs = self.call(cast_inputs, *args, **kwargs)
    823           self._handle_activity_regularization(inputs, outputs)
    824           self._set_mask_metadata(inputs, outputs, input_masks)

~/.local/lib/python3.5/site-packages/transformers/modeling_tf_bert.py in call(self, inputs, training)
    222 
    223         batch_size = shape_list(hidden_states)[0]
--> 224         mixed_query_layer = self.query(hidden_states)
    225         mixed_key_layer = self.key(hidden_states)
    226         mixed_value_layer = self.value(hidden_states)

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
    820           with base_layer_utils.autocast_context_manager(
    821               self._compute_dtype):
--> 822             outputs = self.call(cast_inputs, *args, **kwargs)
    823           self._handle_activity_regularization(inputs, outputs)
    824           self._set_mask_metadata(inputs, outputs, input_masks)

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/layers/core.py in call(self, inputs)
   1142         outputs = gen_math_ops.mat_mul(inputs, self.kernel)
   1143     if self.use_bias:
-> 1144       outputs = nn.bias_add(outputs, self.bias)
   1145     if self.activation is not None:
   1146       return self.activation(outputs)  # pylint: disable=not-callable

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/ops/nn_ops.py in bias_add(value, bias, data_format, name)
   2756     else:
   2757       return gen_nn_ops.bias_add(
-> 2758           value, bias, data_format=data_format, name=name)
   2759 
   2760 

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/ops/gen_nn_ops.py in bias_add(value, bias, data_format, name)
    675       try:
    676         return bias_add_eager_fallback(
--> 677             value, bias, data_format=data_format, name=name, ctx=_ctx)
    678       except _core._SymbolicException:
    679         pass  # Add nodes to the TensorFlow graph.

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/ops/gen_nn_ops.py in bias_add_eager_fallback(value, bias, data_format, name, ctx)
    703     data_format = "NHWC"
    704   data_format = _execute.make_str(data_format, "data_format")
--> 705   _attr_T, _inputs_T = _execute.args_to_matching_eager([value, bias], ctx)
    706   (value, bias) = _inputs_T
    707   _inputs_flat = [value, bias]

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/eager/execute.py in args_to_matching_eager(l, ctx, default_dtype)
    265         dtype = ret[-1].dtype
    266   else:
--> 267     ret = [ops.convert_to_tensor(t, dtype, ctx=ctx) for t in l]
    268 
    269   # TODO(slebedev): consider removing this as it leaks a Keras concept.

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/eager/execute.py in <listcomp>(.0)
    265         dtype = ret[-1].dtype
    266   else:
--> 267     ret = [ops.convert_to_tensor(t, dtype, ctx=ctx) for t in l]
    268 
    269   # TODO(slebedev): consider removing this as it leaks a Keras concept.

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/framework/ops.py in convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, dtype_hint, ctx, accepted_result_types)
   1312 
   1313     if ret is None:
-> 1314       ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
   1315 
   1316     if ret is NotImplemented:

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/distribute/values.py in _tensor_conversion_mirrored(var, dtype, name, as_ref)
   1174 # allowing instances of the class to be used as tensors.
   1175 def _tensor_conversion_mirrored(var, dtype=None, name=None, as_ref=False):
-> 1176   return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref)  # pylint: disable=protected-access
   1177 
   1178 

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/distribute/values.py in _dense_var_to_tensor(self, dtype, name, as_ref)
    908     if _enclosing_tpu_context() is None:
    909       return super(TPUVariableMixin, self)._dense_var_to_tensor(
--> 910           dtype=dtype, name=name, as_ref=as_ref)
    911     # pylint: enable=protected-access
    912     elif dtype is not None and dtype != self.dtype:

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/distribute/values.py in _dense_var_to_tensor(self, dtype, name, as_ref)
   1164     assert not as_ref
   1165     return ops.convert_to_tensor(
-> 1166         self.get(), dtype=dtype, name=name, as_ref=as_ref)
   1167 
   1168   def _clone_with_new_values(self, new_values):

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/distribute/values.py in get(self, device)
    835   def get(self, device=None):
    836     if (_enclosing_tpu_context() is None) or (device is not None):
--> 837       return super(TPUVariableMixin, self).get(device=device)
    838     else:
    839       raise NotImplementedError(

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/distribute/values.py in get(self, device)
    320         device = distribute_lib.get_update_device()
    321         if device is None:
--> 322           return self._get_cross_replica()
    323     device = device_util.canonicalize(device)
    324     return self._device_map.select_for_device(self._values, device)

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/distribute/values.py in _get_cross_replica(self)
   1136     replica_id = self._device_map.replica_for_device(device)
   1137     if replica_id is None:
-> 1138       return array_ops.identity(self.primary)
   1139     return array_ops.identity(self._values[replica_id])
   1140 

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/util/dispatch.py in wrapper(*args, **kwargs)
    178     """Call target, and fall back on dispatchers if there is a TypeError."""
    179     try:
--> 180       return target(*args, **kwargs)
    181     except (TypeError, ValueError):
    182       # Note: convert_to_eager_tensor currently raises a ValueError, not a

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/ops/array_ops.py in identity(input, name)
    265     # variables. Variables have correct handle data when graph building.
    266     input = ops.convert_to_tensor(input)
--> 267   ret = gen_array_ops.identity(input, name=name)
    268   # Propagate handle data for happier shape inference for resource variables.
    269   if hasattr(input, "_handle_data"):

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/ops/gen_array_ops.py in identity(input, name)
   3824         pass  # Add nodes to the TensorFlow graph.
   3825     except _core._NotOkStatusException as e:
-> 3826       _ops.raise_from_not_ok_status(e, name)
   3827   # Add nodes to the TensorFlow graph.
   3828   _, _, _op, _outputs = _op_def_library._apply_op_helper(

/usr/local/lib/python3.5/dist-packages/tensorflow_core/python/framework/ops.py in raise_from_not_ok_status(e, name)
   6604   message = e.message + (" name: " + name if name is not None else "")
   6605   # pylint: disable=protected-access
-> 6606   six.raise_from(core._status_to_exception(e.code, message), None)
   6607   # pylint: enable=protected-access
   6608 

/usr/local/lib/python3.5/dist-packages/six.py in raise_from(value, from_value)

NotFoundError: '_MklMatMul' is neither a type of a primitive operation nor a name of a function registered in binary running on n-aa2fcfb7-w-0. One possible root cause is the client and server binaries are not built with the same version. Please make sure the operation or function is registered in the binary running in this process. [Op:Identity]

I posted this on the Hugging Face GitHub (https://github.com/huggingface/transformers/issues/2572), and they suggested that the TPU server version may not match the TPU client version, but I don't know (a) how to check for that or (b) what to do about it. Suggestions appreciated.

来源：https://stackoverflow.com/questions/59851553/huggingface-bert-tpu-fine-tuning-works-on-colab-but-not-in-gcp

标签

google-cloud-platform

google-colaboratory

google-cloud-tpu

BERT

huggingface-transformers