transformers/examples/research_projects/codeparrot/scripts/tests/test_deduplicate.py
Sylvain Gugger 6f79d26442
Update quality tooling for formatting (#21480)
* Result of black 23.1

* Update target to Python 3.7

* Switch flake8 to ruff

* Configure isort

* Configure isort

* Apply isort with line limit

* Put the right black version

* adapt black in check copies

* Fix copies
2023-02-06 18:10:56 -05:00

30 lines
1004 B
Python

from unittest import TestCase
from datasets import Dataset
from minhash_deduplication import deduplicate_dataset, make_duplicate_clusters
def get_dataset():
data_dict = {
"repo_name": ["test_repo1", "test_repo2", "test_repo3"],
"path": ["test_1.py", "test_2.py", "unit_test.py"],
"content": ["a " * 20, "a " * 30, "b " * 7],
}
dataset = Dataset.from_dict(data_dict)
return dataset
class MakeDuplicateClustersTest(TestCase):
def test_make_duplicate_clusters(self):
ds = get_dataset()
duplicate_clusters = make_duplicate_clusters(ds, 0.85)
self.assertEqual(len(duplicate_clusters[0]), 2)
def test_deduplicate_dataset(self):
ds = get_dataset()
ds_filter, duplicate_clusters = deduplicate_dataset(ds)
self.assertEqual(len(ds_filter), 2)
print(duplicate_clusters)
self.assertEqual(duplicate_clusters[0][0]["copies"], 2)
self.assertEqual(duplicate_clusters[0][0]["is_extreme"], True)