| import sys |
| import os |
| from datetime import datetime |
| import pandas as pd |
| import contexttimer |
| from urllib.request import urlopen |
| import requests |
| from PIL import Image |
| import torch |
| from torchvision.transforms import functional as TF |
| from multiprocessing import Pool |
| from tqdm import tqdm |
| import logging |
| import sys |
| import numpy as np |
|
|
|
|
|
|
from nltk.tag import CRFTagger

# Module-level CRF tagger used by drop_no(). The model file name is relative,
# so it is looked up from the directory the script is run in.
ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
|
|
# HTTP header values kept at module level.
# NOTE(review): nothing in the visible part of this script reads `headers`;
# presumably it is passed to outgoing HTTP requests elsewhere -- confirm.
headers = {
    "User-Agent": "Googlebot-Image/1.0",
    "X-Forwarded-For": "64.18.15.200",
}
|
|
| |
# Send INFO-and-above log records to download.log; filemode 'w' truncates the
# file at the start of every run.
logging.basicConfig(
    level=logging.INFO,
    filename='download.log',
    filemode='w',
)

# Suppress urllib3's InsecureRequestWarning for the whole process.
requests.packages.urllib3.disable_warnings(
    requests.packages.urllib3.exceptions.InsecureRequestWarning
)
|
|
# Two positional arguments are required: the input .tsv (sys.argv[1]) and the
# output path (sys.argv[2]). Exit with a usage message rather than letting a
# missing argument surface later as an IndexError.
if len(sys.argv) < 3:
    print(
        f"Usage: python {sys.argv[0]} <input.tsv> <output.tsv>",
        file=sys.stderr,
    )
    sys.exit(1)

print(f'Starting to load at {datetime.now().isoformat(timespec="minutes")}')

# Read the tab-separated input inside a contexttimer.Timer block and keep only
# the caption and image-URL columns, in that order.
with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(sys.argv[1], delimiter='\t')
    df = df[["caption_reference_description", "image_url"]]
|
|
def drop_no(text, threshold=0.8):
    """Return True when a caption should be dropped from the dataset.

    A caption is dropped when it is missing or has no tokens, when tagging
    fails, or when the fraction of its whitespace-separated tokens that the
    module-level tagger ``ct`` labels ``"NNP"`` is at least ``threshold``.

    Args:
        text: Caption to inspect. Anything that is not a ``str`` (for example
            the NaN pandas stores for an empty cell) is treated as missing.
        threshold: Minimum fraction of ``"NNP"``-tagged tokens at which the
            caption is dropped. Defaults to 0.8.

    Returns:
        True if the caption should be dropped, False if it should be kept.
    """
    # Missing values and non-text cells cannot be tagged; drop them up front
    # instead of relying on an exception from len()/split().
    if not isinstance(text, str):
        return True

    tokens = text.split()
    # Empty or whitespace-only captions yield no tokens; returning here also
    # avoids a division by zero further down.
    if not tokens:
        return True

    try:
        tagged = ct.tag_sents([tokens])[0]
        nnp_count = sum(1 for pair in tagged if pair[1] == "NNP")
    except Exception as e:
        # Best-effort filter: a tagging failure drops the row rather than
        # aborting the whole run.
        print(e)
        return True

    # Guard against a tagger that returns nothing for a non-empty input.
    if not tagged:
        return True

    return nnp_count / len(tagged) >= threshold
| |
# Run drop_no over every caption and keep only the rows it does not flag.
# The mask is built directly instead of adding and then removing a temporary
# "to_drop" column on the column-sliced frame; astype(bool) keeps the mask
# boolean even when the frame is empty.
keep_mask = ~df["caption_reference_description"].apply(drop_no).astype(bool)
df = df[keep_mask]

# Write the surviving rows, index included, as tab-separated text to the
# path given as the second command-line argument.
df.to_csv(sys.argv[2], sep='\t')
|
|
|
|