| | """The Tweet Eval Datasets""" |
| |
|
| |
|
| | import datasets |
| |
|
| |
|
_CITATION = """\
@inproceedings{barbieri2020tweeteval,
title={{TweetEval: Unified Benchmark and Comparative Evaluation for Tweet Classification}},
author={Barbieri, Francesco and Camacho-Collados, Jose and Espinosa-Anke, Luis and Neves, Leonardo},
booktitle={Proceedings of Findings of EMNLP},
year={2020}
}
"""

_DESCRIPTION = """\
TweetEval consists of seven heterogeneous tasks on Twitter, all framed as multi-class tweet classification. All tasks have been unified into the same benchmark, with each dataset presented in the same format and with fixed training, validation and test splits.
"""

_HOMEPAGE = "https://github.com/cardiffnlp/tweeteval"

_LICENSE = ""

URL = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/"

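# Raw-file URLs for every task. Each task provides line-aligned *_text.txt and
# *_labels.txt files for the train/val/test splits; the stance task is further
# organised into one sub-directory per stance target.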
_URLs = {
    "emoji": {
        "train_text": URL + "emoji/train_text.txt",
        "train_labels": URL + "emoji/train_labels.txt",
        "test_text": URL + "emoji/test_text.txt",
        "test_labels": URL + "emoji/test_labels.txt",
        "val_text": URL + "emoji/val_text.txt",
        "val_labels": URL + "emoji/val_labels.txt",
    },
    "emotion": {
        "train_text": URL + "emotion/train_text.txt",
        "train_labels": URL + "emotion/train_labels.txt",
        "test_text": URL + "emotion/test_text.txt",
        "test_labels": URL + "emotion/test_labels.txt",
        "val_text": URL + "emotion/val_text.txt",
        "val_labels": URL + "emotion/val_labels.txt",
    },
    "hate": {
        "train_text": URL + "hate/train_text.txt",
        "train_labels": URL + "hate/train_labels.txt",
        "test_text": URL + "hate/test_text.txt",
        "test_labels": URL + "hate/test_labels.txt",
        "val_text": URL + "hate/val_text.txt",
        "val_labels": URL + "hate/val_labels.txt",
    },
    "irony": {
        "train_text": URL + "irony/train_text.txt",
        "train_labels": URL + "irony/train_labels.txt",
        "test_text": URL + "irony/test_text.txt",
        "test_labels": URL + "irony/test_labels.txt",
        "val_text": URL + "irony/val_text.txt",
        "val_labels": URL + "irony/val_labels.txt",
    },
    "offensive": {
        "train_text": URL + "offensive/train_text.txt",
        "train_labels": URL + "offensive/train_labels.txt",
        "test_text": URL + "offensive/test_text.txt",
        "test_labels": URL + "offensive/test_labels.txt",
        "val_text": URL + "offensive/val_text.txt",
        "val_labels": URL + "offensive/val_labels.txt",
    },
    "sentiment": {
        "train_text": URL + "sentiment/train_text.txt",
        "train_labels": URL + "sentiment/train_labels.txt",
        "test_text": URL + "sentiment/test_text.txt",
        "test_labels": URL + "sentiment/test_labels.txt",
        "val_text": URL + "sentiment/val_text.txt",
        "val_labels": URL + "sentiment/val_labels.txt",
    },
    "stance": {
        "abortion": {
            "train_text": URL + "stance/abortion/train_text.txt",
            "train_labels": URL + "stance/abortion/train_labels.txt",
            "test_text": URL + "stance/abortion/test_text.txt",
            "test_labels": URL + "stance/abortion/test_labels.txt",
            "val_text": URL + "stance/abortion/val_text.txt",
            "val_labels": URL + "stance/abortion/val_labels.txt",
        },
        "atheism": {
            "train_text": URL + "stance/atheism/train_text.txt",
            "train_labels": URL + "stance/atheism/train_labels.txt",
            "test_text": URL + "stance/atheism/test_text.txt",
            "test_labels": URL + "stance/atheism/test_labels.txt",
            "val_text": URL + "stance/atheism/val_text.txt",
            "val_labels": URL + "stance/atheism/val_labels.txt",
        },
        "climate": {
            "train_text": URL + "stance/climate/train_text.txt",
            "train_labels": URL + "stance/climate/train_labels.txt",
            "test_text": URL + "stance/climate/test_text.txt",
            "test_labels": URL + "stance/climate/test_labels.txt",
            "val_text": URL + "stance/climate/val_text.txt",
            "val_labels": URL + "stance/climate/val_labels.txt",
        },
        "feminist": {
            "train_text": URL + "stance/feminist/train_text.txt",
            "train_labels": URL + "stance/feminist/train_labels.txt",
            "test_text": URL + "stance/feminist/test_text.txt",
            "test_labels": URL + "stance/feminist/test_labels.txt",
            "val_text": URL + "stance/feminist/val_text.txt",
            "val_labels": URL + "stance/feminist/val_labels.txt",
        },
        "hillary": {
            "train_text": URL + "stance/hillary/train_text.txt",
            "train_labels": URL + "stance/hillary/train_labels.txt",
            "test_text": URL + "stance/hillary/test_text.txt",
            "test_labels": URL + "stance/hillary/test_labels.txt",
            "val_text": URL + "stance/hillary/val_text.txt",
            "val_labels": URL + "stance/hillary/val_labels.txt",
        },
    },
}


class TweetEvalConfig(datasets.BuilderConfig):
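    """BuilderConfig for a single TweetEval task (plus a target for the stance task)."""
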
    def __init__(self, *args, type=None, sub_type=None, **kwargs):
        super().__init__(
            *args,
            name=f"{type}" if type != "stance" else f"{type}_{sub_type}",
            **kwargs,
        )
        self.type = type
        self.sub_type = sub_type


class TweetEval(datasets.GeneratorBasedBuilder):
    """TweetEval Dataset."""

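    # One config per flat task (emoji, emotion, hate, irony, offensive, sentiment)
    # and one config per stance target (stance_abortion, stance_atheism, ...).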
    BUILDER_CONFIGS = [
        TweetEvalConfig(
            type=key,
            sub_type=None,
            version=datasets.Version("1.1.0"),
            description=f"This configuration covers the {key} task of the TweetEval benchmark.",
        )
        for key in _URLs
        if key != "stance"
    ] + [
        TweetEvalConfig(
            type="stance",
            sub_type=key,
            version=datasets.Version("1.1.0"),
            description=f"This configuration covers the stance_{key} task of the TweetEval benchmark.",
        )
        for key in _URLs["stance"]
    ]

    def _info(self):
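        # ClassLabel maps the integer ids found in the *_labels.txt files onto
        # these human-readable names, one set per task.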
        if self.config.type == "stance":
            names = ["none", "against", "favor"]
        elif self.config.type == "sentiment":
            names = ["negative", "neutral", "positive"]
        elif self.config.type == "offensive":
            names = ["non-offensive", "offensive"]
        elif self.config.type == "irony":
            names = ["non_irony", "irony"]
        elif self.config.type == "hate":
            names = ["non-hate", "hate"]
        elif self.config.type == "emoji":
            names = [
                "❤",
                "😍",
                "😂",
                "💕",
                "🔥",
                "😊",
                "😎",
                "✨",
                "💙",
                "😘",
                "📷",
                "🇺🇸",
                "☀",
                "💜",
                "😉",
                "💯",
                "😁",
                "🎄",
                "📸",
                "😜",
            ]
        else:
            names = ["anger", "joy", "optimism", "sadness"]

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {"text": datasets.Value("string"), "label": datasets.features.ClassLabel(names=names)}
            ),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        if self.config.type != "stance":
            my_urls = _URLs[self.config.type]
        else:
            my_urls = _URLs[self.config.type][self.config.sub_type]
        data_dir = dl_manager.download_and_extract(my_urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"text_path": data_dir["train_text"], "labels_path": data_dir["train_labels"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"text_path": data_dir["test_text"], "labels_path": data_dir["test_labels"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"text_path": data_dir["val_text"], "labels_path": data_dir["val_labels"]},
            ),
        ]

    def _generate_examples(self, text_path, labels_path):
        """Yields examples."""
        with open(text_path, encoding="utf-8") as f:
            texts = f.readlines()
        with open(labels_path, encoding="utf-8") as f:
            labels = f.readlines()
        for i, text in enumerate(texts):
            yield i, {"text": text.strip(), "label": int(labels[i].strip())}
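

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original loading script. It assumes
    # this file is saved locally (e.g. as tweet_eval.py) and that the installed
    # `datasets` version still supports loading local dataset scripts.
    from datasets import load_dataset

    emotion = load_dataset(__file__, "emotion")  # build the "emotion" config from this script
    print(emotion["train"].features["label"].names)  # ["anger", "joy", "optimism", "sadness"]
    print(emotion["train"][0])  # a dict with a "text" string and an integer "label"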