mirror of
https://github.com/qurator-spk/page2tsv.git
synced 2025-07-30 04:09:53 +02:00
Add --show-columns and --drop-column options to tsv2tsv
This commit is contained in:
parent
198221651f
commit
b9cb04389c
1 changed file with 12 additions and 2 deletions
|
@ -270,9 +270,11 @@ def unicode_normalize(text, normalization_map=None, use_combining_characters=Tru
|
||||||
@click.option('--just-zero', type=bool, is_flag=True, help='Process only files that have max sentence length zero,'
|
@click.option('--just-zero', type=bool, is_flag=True, help='Process only files that have max sentence length zero,'
|
||||||
'i.e., that do not have sentence splitting.')
|
'i.e., that do not have sentence splitting.')
|
||||||
@click.option('--sanitize-sentence-numbers', type=bool, is_flag=True, help='Sanitize sentence numbering.')
|
@click.option('--sanitize-sentence-numbers', type=bool, is_flag=True, help='Sanitize sentence numbering.')
|
||||||
|
@click.option('--show-columns', type=bool, is_flag=True, help='Show TSV columns.')
|
||||||
|
@click.option('--drop-column', type=str, multiple=True, default=[], help="Drop column")
|
||||||
def tsv2tsv(tsv_in_file, tsv_out_file, ner_rest_endpoint, noproxy,
|
def tsv2tsv(tsv_in_file, tsv_out_file, ner_rest_endpoint, noproxy,
|
||||||
num_tokens, sentence_count, max_sentence_len, keep_tokenization, sentence_split_only,
|
num_tokens, sentence_count, max_sentence_len, keep_tokenization, sentence_split_only,
|
||||||
show_urls, just_zero, sanitize_sentence_numbers):
|
show_urls, just_zero, sanitize_sentence_numbers, show_columns, drop_column):
|
||||||
|
|
||||||
if noproxy:
|
if noproxy:
|
||||||
os.environ['no_proxy'] = '*'
|
os.environ['no_proxy'] = '*'
|
||||||
|
@ -286,7 +288,13 @@ def tsv2tsv(tsv_in_file, tsv_out_file, ner_rest_endpoint, noproxy,
|
||||||
print("Input file: {}".format(tsv_in_file))
|
print("Input file: {}".format(tsv_in_file))
|
||||||
|
|
||||||
if show_urls:
|
if show_urls:
|
||||||
print("URLS:{}".format(urls))
|
if urls is None or len(urls) == 0:
|
||||||
|
print("URLS missing!")
|
||||||
|
else:
|
||||||
|
print("URLS:{}".format(urls))
|
||||||
|
|
||||||
|
if show_columns:
|
||||||
|
print("Columns: ", " ".join([ c for c in tsv.columns]))
|
||||||
|
|
||||||
if tsv['No.'].max() > 0 and just_zero:
|
if tsv['No.'].max() > 0 and just_zero:
|
||||||
|
|
||||||
|
@ -360,6 +368,8 @@ def tsv2tsv(tsv_in_file, tsv_out_file, ner_rest_endpoint, noproxy,
|
||||||
|
|
||||||
word_pos += 1
|
word_pos += 1
|
||||||
|
|
||||||
|
tsv_out = tsv_out.drop(columns=[dc for dc in drop_column if dc in tsv_out.columns])
|
||||||
|
|
||||||
write_tsv(tsv_out, urls, contexts, tsv_out_file)
|
write_tsv(tsv_out, urls, contexts, tsv_out_file)
|
||||||
|
|
||||||
print("\n")
|
print("\n")
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue