Fix token_type_ids error for big_bird model. (#11355)

* MOD: fit chinese wwm to new datasets * MOD: move wwm to new folder * MOD: formate code * Styling * MOD add param and recover trainer * MOD: add token_type_ids method for big bird * MOD: format code * MOD: format code Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
2025-07-31 02:02:21 +06:00 · 2021-04-22 01:37:57 +08:00 · 2021-04-22 01:37:57 +08:00 · 5e04d70868
commit 5e04d70868
parent 5aaf5aac0b
1 changed files with 24 additions and 0 deletions
--- a/src/transformers/models/big_bird/tokenization_big_bird.py
+++ b/src/transformers/models/big_bird/tokenization_big_bird.py
@ -226,3 +226,27 @@ class BigBirdTokenizer(PreTrainedTokenizer):
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
+        pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second
+        sequence | If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]