diff --git a/mtc/challenge_pipelines/calculate_cluster_idx.py b/mtc/challenge_pipelines/calculate_cluster_idx.py
deleted file mode 100755
index ee98324..0000000
--- a/mtc/challenge_pipelines/calculate_cluster_idx.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import os
-import numpy as np
-import pandas as pd
-from sklearn.cluster import KMeans
-
-
-if __name__ == "__main__":
-
-
-    NETWORK_SHARE = '/home/klaus/networkdrives/'
-    # Results:
-    SET_DIR = 'E132-Projekte/Projects/2019_n2c2_challenge/submission_generation/03_12_2020_20_18_37_original_data/bert_base/'
-    TSNE_DIR = NETWORK_SHARE + SET_DIR + '1_example_run_step1/preprocessed_data_2020-03-15_21-49-55/'
-
-    test_train_labels = 2 * (1642 * ['Training set'] + 412 * ['Test set'])
-    train_idx_bool = [t == 'Training set' for t in test_train_labels]
-
-    vectors = np.load(TSNE_DIR + 'tsne_vectors.npy')
-    projections = np.load(TSNE_DIR + 'tsne_projections.npy')
-
-    k = 10
-    kmeans = KMeans(n_clusters=k, random_state=1337).fit(vectors[train_idx_bool])
-
-    print('Quality of cluster', set(kmeans.labels_[0:1642]-kmeans.labels_[0:1642]))
-
-    test_cluster = [0, 3, 4, 7, 9]
-    ix = np.isin(kmeans.labels_[0:1642], test_cluster)
-    np.save(os.path.join(TSNE_DIR, 'test_cluster_idx'), ix)
\ No newline at end of file
diff --git a/mtc/challenge_pipelines/data_augmentation/__init__.py b/mtc/challenge_pipelines/data_augmentation/__init__.py
deleted file mode 100755
index e69de29..0000000
diff --git a/mtc/challenge_pipelines/data_augmentation/augment_data.py b/mtc/challenge_pipelines/data_augmentation/augment_data.py
deleted file mode 100755
index fee9bfa..0000000
--- a/mtc/challenge_pipelines/data_augmentation/augment_data.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import os
-from mtc.helpers.nlpaug_fun import NLPAUG
-from mtc.helpers.data_augmentation import augment_sentences
-from mtc.helpers.file_management import save_augmented_sts_data, load_sts_data
-from mtc.settings import NLP_MODELS_PATH, NLP_RAW_DATA
-
-sts_data = load_sts_data(os.path.join('n2c2', 'clinicalSTS2019.train.txt'))
-
-number_data = len(sts_data['raw_sentences_a'])
-raw_sentences_a = sts_data['raw_sentences_a'][0:number_data]
-raw_sentences_b = sts_data['raw_sentences_b'][0:number_data]
-scores = sts_data['similarity_score'][0:number_data]
-
-# numb_translations = 5
-# new_raw_sentences_a = augment_sentences(raw_sentences_a, numb_translations=numb_translations)
-# new_raw_sentences_b = augment_sentences(raw_sentences_b, numb_translations=numb_translations)
-# new_scores = scores
-
-nlp_aug = NLPAUG()
-new_raw_sentences_a = nlp_aug.augment(raw_sentences_a)
-new_raw_sentences_b = nlp_aug.augment(raw_sentences_b)
-new_scores = scores
-
-sts_data['raw_sentences_a'] = sts_data['raw_sentences_a'] + new_raw_sentences_a
-sts_data['raw_sentences_b'] = sts_data['raw_sentences_b'] + new_raw_sentences_b
-sts_data['similarity_score'] = sts_data['similarity_score'] + new_scores
-
-print('#####')
-print(raw_sentences_a)
-print(new_raw_sentences_a)
-print('#####')
-print(raw_sentences_b)
-print(new_raw_sentences_b)
-save_augmented_sts_data(sts_data, os.path.join('n2c2', 'clinicalSTS2019.augmented.train.txt'))
diff --git a/mtc/challenge_pipelines/data_augmentation/google_trans.py b/mtc/challenge_pipelines/data_augmentation/google_trans.py
deleted file mode 100755
index 9841da8..0000000
--- a/mtc/challenge_pipelines/data_augmentation/google_trans.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import os
-os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/klaus/repositories/k656s/googleapplicationcredentials/N2C2-2dbc3c91e74b.json"
-
-def run_quickstart():
-    # [START translate_quickstart]
-    # Imports the Google Cloud client library
-    from google.cloud import translate
-
-    # Instantiates a client
-    translate_client = translate.Client()
-
-    # The text to translate
-    text = u'Hello, world!'
-    # The target language
-    target = 'ru'
-
-    # Translates some text into Russian
-    translation = translate_client.translate(
-        text,
-        target_language=target)
-
-    print(u'Text: {}'.format(text))
-    print(u'Translation: {}'.format(translation['translatedText']))
-    # [END translate_quickstart]
-
-
-if __name__ == '__main__':
-    run_quickstart()
\ No newline at end of file
diff --git a/mtc/challenge_pipelines/data_augmentation/microsoft_trans.py b/mtc/challenge_pipelines/data_augmentation/microsoft_trans.py
deleted file mode 100755
index fb2a57f..0000000
--- a/mtc/challenge_pipelines/data_augmentation/microsoft_trans.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from translate import Translator
-translator = Translator(to_lang="de")
-translation = translator.translate("This is a house.")
-print(translation)
\ No newline at end of file
diff --git a/mtc/helpers/bert_embedding_server_worker.py b/mtc/helpers/bert_embedding_server_worker.py
deleted file mode 100755
index 5febb8f..0000000
--- a/mtc/helpers/bert_embedding_server_worker.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import os
-
-from bert_serving.server import BertServer
-from bert_serving.server.helper import get_args_parser
-
-from mtc.settings import NLP_MODELS_PATH, ZEROMQ_SOCK_TMP_DIR
-
-os.environ["ZEROMQ_SOCK_TMP_DIR"] = ZEROMQ_SOCK_TMP_DIR
-model_dir = os.path.join(NLP_MODELS_PATH, 'pretrained', 'word_embeddings', 'bert_models', 'uncased_L-24_H-1024_A-16')
-
-args = get_args_parser().parse_args(['-model_dir', model_dir,
-                                     '-port', '5555',
-                                     '-port_out', '5556',
-                                     '-pooling_strategy', 'CLS_TOKEN',
-                                     '-max_seq_len', '128',
-                                     #'-graph_tmp_dir', '/home/klaus/private_klaus/bert_trash',
-                                     '-show_tokens_to_client',
-                                     '-mask_cls_sep'])
-server = BertServer(args)
-server.start()
diff --git a/mtc/helpers/data_augmentation.py b/mtc/helpers/data_augmentation.py
deleted file mode 100755
index a555e2e..0000000
--- a/mtc/helpers/data_augmentation.py
+++ /dev/null
@@ -1,119 +0,0 @@
-import pandas as pd
-import random
-from segtok.tokenizer import word_tokenizer
-from segtok.tokenizer import split_contractions
-# from googletrans import Translator
-# #translator = Translator()
-#
-# translator = Translator(service_urls=[
-#       'translate.google.com',
-#       'translate.google.co.kr',
-# ])
-
-from translate import Translator
-
-
-def augment_sentences(raw_sentences, language_list=None, numb_translations=5, original_lang='en'):
-    if language_list is None:
-        # language_list = [key for key, item in googletrans.LANGUAGES.items()]
-        language_list = ['fr', 'de', 'en', 'es', 'it', 'pt', 'ru', 'pl', 'no']
-    language_list.remove(original_lang)
-    if numb_translations is None:
-        numb_translations = 5
-    start = 0
-    end = len(language_list) - 1
-
-    translation_list = random.sample(range(start, end), numb_translations)
-
-    translator_list = []
-    translator_list.append(Translator(from_lang=original_lang, to_lang=language_list[translation_list[0]]))
-    for tra_src, tra_dest in zip(translation_list[:-1], translation_list[1:]):
-        translator_list.append(Translator(from_lang=language_list[tra_src], to_lang=language_list[tra_dest]))
-    translator_list.append( Translator(from_lang=language_list[translation_list[-1]], to_lang=original_lang))
-
-    new_raw_sentences = []
-    for idx, original_text in enumerate(raw_sentences):
-        translation_list = random.sample(range(start, end), numb_translations)
-
-        trans_text = original_text
-        for translator in translator_list:
-            trans_text = translator.translate(trans_text)
-
-        original_splitted = split_contractions(word_tokenizer(original_text))
-        trans_splitted = split_contractions(word_tokenizer(trans_text))
-        if original_splitted != trans_splitted:
-            new_raw_sentences.append(trans_text)
-
-        print(idx)
-    return new_raw_sentences
-
-def augment_sentences_backup(raw_sentences, language_list=None, numb_translations=5, original_lang='en'):
-    if language_list is None:
-        # language_list = [key for key, item in googletrans.LANGUAGES.items()]
-        language_list = ['fr', 'de', 'en', 'es', 'it', 'pt', 'ru', 'pl', 'no']
-    language_list.remove(original_lang)
-    if numb_translations is None:
-        numb_translations = 5
-    start = 0
-    end = len(language_list) - 1
-
-    new_raw_sentences = []
-    for idx, original_text in enumerate(raw_sentences):
-
-
-        translation_list = random.sample(range(start, end), numb_translations)
-
-        trans_text = original_text
-
-        translator = Translator(from_lang=original_lang, to_lang=language_list[translation_list[0]])
-        trans_text = translator.translate(trans_text)
-
-        for tra_src, tra_dest in zip(translation_list[:-1], translation_list[1:]):
-            translator = Translator(from_lang=language_list[tra_src], to_lang=language_list[tra_dest])
-            trans_text = translator.translate(trans_text)
-            #print(trans_text)
-        translator = Translator(from_lang=language_list[translation_list[-1]], to_lang=original_lang)
-        trans_text = translator.translate(trans_text)
-        original_splitted = split_contractions(word_tokenizer(original_text))
-        trans_splitted = split_contractions(word_tokenizer(trans_text))
-        if original_splitted != trans_splitted:
-            new_raw_sentences.append(trans_text)
-
-        print(idx)
-    return new_raw_sentences
-
-
-
-if __name__ == '__main__':
-    EXAMPLE_SENTENCES = [
-        'terminal 1 is connected to the negative battery terminal',
-        'there is no gap between terminal 6 and the positive terminal',
-        'bulb a is still contained in a closed path with the battery .',
-        'each bulb is in its own path',
-        'a non-zero voltage means that the terminals are not connected .',
-        'bulb a was still contained in the same closed path with the battery .',
-        'bulb a was still contained in the same closed path with the battery .',
-        'terminals 1 , 2 and 3 are separated from the positive battery terminal by a gap',
-        'terminal 6 is separated by a gap from the negative battery terminal',
-        'if a bulb and a switch are in the same path the switch affects the bulb',
-        'a , b and c are in different paths',
-        'a battery uses a chemical reaction to maintain different electrical states at the terminals',
-        'the terminals are not connected',
-        'bulb b is in a separate path',
-        'bulb a is still contained in a closed path with the battery and switch z .',
-        'bulb a was still contained in the same closed path with the battery .',
-        'bulb a is still in a closed path with the battery',
-        'there is no gap between terminal 6 and the positive terminal',
-        'bulb c was not in a closed path',
-        'a battery uses a chemical reaction to maintain different electrical states at the terminals',
-        'terminals 1 , 2 and 3 are separated from the positive battery terminal by a gap',
-        'the open switch creates a gap',
-        'a and c are in the same closed path',
-    ]
-
-    EXAMPLE_SENTENCES = [
-        'Kein Nachweis pathologisch vergrößerter Lymphknoten mediastinal und axillär beidseits.'
-    ]
-    new_sentences = augment_sentences(EXAMPLE_SENTENCES, original_lang='de')
-
-    print(new_sentences)
diff --git a/mtc/helpers/util.py b/mtc/helpers/util.py
index 9d48996..9c35ab1 100755
--- a/mtc/helpers/util.py
+++ b/mtc/helpers/util.py
@@ -1,98 +1,92 @@
 from typing import List, Dict, Union
 
 import re
 import json
 
 import torch
 
 
 class LoadedModels:
     """
     Class that caches the loaded models
     """
 
     @staticmethod
     def _get_key(name, *args, **kwargs):
         return f'{name}_{str(args)}_{str(kwargs)}'
 
     def __init__(self):
         self.models = dict()
 
     def get_class_instance(self, cls, name, *args, **kwargs):
         if not self._key_exists(name, *args, **kwargs):
             print('loading', self._get_key(name, *args, **kwargs))
             self._add_model(cls(*args, **kwargs), name, *args, **kwargs)
         return self._get_model(name, *args, **kwargs)
 
     def _key_exists(self, name, *args, **kwargs):
         if self._get_key(name, *args, **kwargs) not in self.models.keys():
             return False
         else:
             return True
 
     def _add_model(self, model_instance, name, *args, **kwargs):
         self.models.update({self._get_key(name, *args, **kwargs): model_instance})
 
     def _get_model(self, name, *args, **kwargs):
         return self.models[self._get_key(name, *args, **kwargs)]
 
 
 class PipelineDictArgument(dict):
     def __init__(self, *args, **kwargs):
         if args is None:
             args = []
         if kwargs is None:
             kwargs = {}
 
         super().__init__({
             'args': args,
             'kwargs': kwargs
         })
 
     def __str__(self):
         return json.dumps(self)
 
 
 def chunks(l, n):
     """Yield successive n-sized chunks from l."""
     for i in range(0, len(l), n):
         yield l[i:i + n]
 
 
 def get_token_embedding_by_names(token, token_names):
     """
     Adapted flair.data.Token get_embedding()
     :param token:
     :param token_names:
     :return:
     """
     token_embeddings = [
         token._embeddings[embed] for embed in sorted(token_names)
     ]
 
     if token_embeddings:
         return torch.cat(token_embeddings, dim=0)
     else:
         return torch.Tensor()
 
 
 def get_med(sentence):
     match = re.search(r'\[([^]]+)\]', sentence)
     if match:
         return match.group(1)
     else:
         return ''
 
 
 def get_ingredient_booleans(raw_sentences: List[str]):
     ingredient_booleans = []
     for raw_sentence in raw_sentences:
         match = re.search(r'\[([^]]+)\]', raw_sentence)
         ingredient_booleans.append(match)
     return ingredient_booleans
-
-if __name__ == '__main__':
-    b = PipelineDictArgument('st', b=['klaus', 'tina'])
-    c = PipelineDictArgument('hd', b=['lukas', 'laura'])
-    a = PipelineDictArgument('a', [c,b], b='c', c=[c, b])
-    print(str(a))