mirror of
https://github.com/huggingface/transformers.git
synced 2025-08-03 03:31:05 +06:00
Norod78/hewiki-articles-distilGPT2py-il model card (#5735)
Model card for hewiki-articles-distilGPT2py-il A tiny GPT2 model for generating Hebrew text
This commit is contained in:
parent
dae244ad89
commit
2db1cc807b
116
model_cards/Norod78/hewiki-articles-distilGPT2py-il/README.md
Normal file
116
model_cards/Norod78/hewiki-articles-distilGPT2py-il/README.md
Normal file
@ -0,0 +1,116 @@
|
||||
---
|
||||
language: he
|
||||
tags:
|
||||
- pytorch
|
||||
- tf
|
||||
- gpt2
|
||||
- lm-head
|
||||
- causal-lm
|
||||
- pipeline:text-generation
|
||||
|
||||
thumbnail: https://avatars1.githubusercontent.com/u/3617152?norod.jpg
|
||||
widget:
|
||||
- text: "<|startoftext|>החוק השני של מועדון קרב הוא"
|
||||
- text: "<|startoftext|>ראש הממשלה בן גוריון"
|
||||
- text: "<|startoftext|>למידת מכונה (סרט)"
|
||||
- text: "<|startoftext|>מנשה פומפרניקל"
|
||||
- text: "<|startoftext|>אי שוויון "
|
||||
|
||||
license: mit
|
||||
---
|
||||
|
||||
|
||||
# hewiki-articles-distilGPT2py-il
|
||||
|
||||
## A tiny GPT2 model for generating Hebrew text
|
||||
|
||||
A distilGPT2 sized model. <br>
|
||||
Training data was hewiki-20200701-pages-articles-multistream.xml.bz2 from https://dumps.wikimedia.org/hewiki/20200701/ <br>
|
||||
XML has been converted to plain text using Wikipedia Extractor http://medialab.di.unipi.it/wiki/Wikipedia_Extractor <br>
|
||||
I then added <|startoftext|> and <|endoftext|> markers and deleted empty lines. <br>
|
||||
|
||||
#### How to use
|
||||
|
||||
```python
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import GPT2Tokenizer, GPT2LMHeadModel
|
||||
|
||||
# Load the pretrained Hebrew distilGPT2 tokenizer and model from the Hub.
# .eval() disables dropout so inference is deterministic given a fixed seed.
MODEL_ID = "Norod78/hewiki-articles-distilGPT2py-il"
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_ID)
model = GPT2LMHeadModel.from_pretrained(MODEL_ID).eval()

bos_token = tokenizer.bos_token  # beginning-of-sentence marker
eos_token = tokenizer.eos_token  # end-of-sentence marker
|
||||
|
||||
def generate_word(model, tokens_tensor, temperature=1.0):
    """Sample the next token given a tensor of previously generated tokens.

    With ``temperature > 0`` the token is drawn from a multinomial over the
    temperature-scaled softmax of the final position's logits; with
    ``temperature == 0`` the arg-max token is taken greedily.  The sampled
    token id is decoded back to text via the module-level ``tokenizer``.
    """
    with torch.no_grad():
        logits = model(tokens_tensor)[0]

    # Only the distribution over the final sequence position matters here.
    last_logits = logits[0, -1, :]

    if temperature > 0:
        # Temperature scaling: values < 1 sharpen the distribution (less
        # random), values > 1 flatten it (more random, escapes repetition).
        probs = nn.Softmax(dim=0)(last_logits / temperature)
        predicted_index = torch.multinomial(probs, 1).item()
    else:
        # Greedy arg-max decoding when no randomness is requested.
        predicted_index = torch.argmax(last_logits).item()

    # Decode the sampled token id back into its text form.
    return tokenizer.decode([predicted_index])
|
||||
|
||||
def generate_sentence(model, tokenizer, initial_text, temperature=1.0):
    """Generate a continuation of *initial_text*, one token at a time.

    Returns a 2-tuple ``(new_text, done)`` where *new_text* is the generated
    continuation (without the prompt) and *done* is True when generation
    ended at an end-of-sentence token or hit the maximum step count.
    """
    text = ""

    # Hard cap of 84 generated tokens guards against an infinite loop.
    for _ in range(84):
        # Re-encode prompt + everything generated so far each step; BPE
        # merges can differ at the boundary, so encoding the full string
        # keeps the context the model sees consistent.
        indexed_tokens = tokenizer.encode(initial_text + text)

        # Convert indexed tokens into a PyTorch tensor (batch of one).
        tokens_tensor = torch.tensor([indexed_tokens])

        new_word = generate_word(model, tokens_tensor, temperature=temperature)

        # Drift the temperature upward by 0.008 per step, capped at 0.996.
        # NOTE(review): the original comment claimed the temperature is
        # slowly *decreased*; the code actually increases it toward 0.996.
        # Behaviour is kept as-is for compatibility.
        if temperature < (1 - 0.008):
            temperature += 0.008
        else:
            temperature = 0.996

        text = text + new_word

        # Stop when the model emits an end-of-sequence marker: strip the
        # marker and report the generation as finished.
        if eos_token in new_word:
            return (text.replace(eos_token, "").strip(), True)
        elif '/' in new_word:
            # A '/' token acts as a soft break: stop, but not "done".
            return (text.strip(), False)
        elif bos_token in new_word:
            # A stray beginning-of-sequence marker also stops generation.
            return (text.replace(bos_token, "").strip(), False)

    # Step budget exhausted: return whatever was generated, marked done.
    return (text, True)
|
||||
|
||||
# Demo: print four generated continuations of the Hebrew prompt "בוקר טוב".
for _ in range(4):
    init_text = "בוקר טוב"
    text = bos_token + init_text
    for _ in range(84):
        generated, done = generate_sentence(model, tokenizer, text, temperature=0.9)
        text = init_text + generated
        print(text)
        if done:
            break
|
||||
```
|
Loading…
Reference in New Issue
Block a user