From b9cb04389c90894f69f473b9a45e57b252189a3f Mon Sep 17 00:00:00 2001
From: Kai Labusch <Kai.Labusch@sbb.spk-berlin.de>
Date: Wed, 26 Mar 2025 15:16:41 +0100
Subject: [PATCH] add --show-columns and --drop-column options to tsv2tsv

---
 qurator/tsvtools/cli.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/qurator/tsvtools/cli.py b/qurator/tsvtools/cli.py
index e14a33f..053aa84 100644
--- a/qurator/tsvtools/cli.py
+++ b/qurator/tsvtools/cli.py
@@ -270,9 +270,11 @@ def unicode_normalize(text, normalization_map=None, use_combining_characters=Tru
 @click.option('--just-zero', type=bool, is_flag=True, help='Process only files that have max sentence length zero,'
                                                            'i.e., that do not have sentence splitting.')
 @click.option('--sanitize-sentence-numbers', type=bool, is_flag=True, help='Sanitize sentence numbering.')
+@click.option('--show-columns', type=bool, is_flag=True, help='Show TSV columns.')
+@click.option('--drop-column', type=str, multiple=True, default=[], help="Drop column")
 def tsv2tsv(tsv_in_file, tsv_out_file, ner_rest_endpoint, noproxy,
             num_tokens, sentence_count, max_sentence_len, keep_tokenization, sentence_split_only,
-            show_urls, just_zero, sanitize_sentence_numbers):
+            show_urls, just_zero, sanitize_sentence_numbers, show_columns, drop_column):
 
     if noproxy:
         os.environ['no_proxy'] = '*'
@@ -286,7 +288,13 @@ def tsv2tsv(tsv_in_file, tsv_out_file, ner_rest_endpoint, noproxy,
     print("Input file: {}".format(tsv_in_file))
 
     if show_urls:
-        print("URLS:{}".format(urls))
+        if urls is None or len(urls) == 0:
+            print("URLS missing!")
+        else:
+            print("URLS:{}".format(urls))
+
+    if show_columns:
+        print("Columns: ", " ".join([ c for c in tsv.columns]))
 
     if tsv['No.'].max() > 0 and just_zero:
 
@@ -360,6 +368,8 @@ def tsv2tsv(tsv_in_file, tsv_out_file, ner_rest_endpoint, noproxy,
 
             word_pos += 1
 
+    tsv_out = tsv_out.drop(columns=[dc for dc in drop_column if dc in tsv_out.columns])
+
     write_tsv(tsv_out, urls, contexts, tsv_out_file)
 
     print("\n")