# configs/data_juicer_recipes/pile-uspto-refine.yaml

# Global settings for this recipe.
project_name: 'Data-Juicer-recipes-uspto'
# Input location: a dataset directory or a single dataset file.
dataset_path: '/path/to/your/dataset'
# Output location: the file the processed dataset is written to.
export_path: '/path/to/your/dataset.jsonl'

# Number of subprocesses used to process the dataset.
np: 50
# NOTE(review): presumably turns on Data-Juicer's tracer; confirm against the
# Data-Juicer configuration reference.
open_tracer: true

# Processing schedule.
# A list of operators; each entry is an operator name, optionally followed by
# a mapping of its arguments.
process:
  # The five mappers below are listed without any arguments.
  - clean_email_mapper:
  - clean_links_mapper:
  - fix_unicode_mapper:
  - punctuation_normalization_mapper:
  - whitespace_normalization_mapper:

  # Filter entries. Inline notes relate each threshold to the "3sigma"
  # reference value recorded by the recipe author; where the chosen threshold
  # differs from it, the reference value is given in parentheses.
  - alphanumeric_filter:
      tokenization: false
      min_ratio: 0.7  # set below the 3sigma value (0.758)
  - average_line_length_filter:  # for code
      max_len: 2000  # set above the 3sigma value (1307)
  - character_repetition_filter:
      rep_len: 10
      max_ratio: 0.2  # set above the 3sigma value (0.189)
  - flagged_words_filter:
      lang: en
      tokenization: true
      max_ratio: 0.0016  # equals the 3sigma value
  - language_id_score_filter:
      min_score: 0.6
  - maximum_line_length_filter:  # for code
      max_len: 3061  # equals the 3sigma value
  - perplexity_filter:
      lang: en
      max_ppl: 4000  # equals the 3sigma value
  - special_characters_filter:
      max_ratio: 0.3  # set above the 3sigma value (0.274)
  - text_length_filter:
      max_len: 21556  # equals the 3sigma value
  - words_num_filter:
      lang: en
      tokenization: true
      min_num: 100
      max_num: 6000  # equals the 3sigma value
  - word_repetition_filter:
      lang: en
      tokenization: true
      rep_len: 10
      max_ratio: 0.169  # equals the 3sigma value

  # Final entry of the process list: SimHash-based document deduplicator.
  - document_simhash_deduplicator:
      tokenization: space
      window_size: 6
      lowercase: true
      # Single-quoted so the backslash reaches the consumer literally.
      # \p{P} is the Unicode "punctuation" property class in regex engines
      # that support property escapes.
      ignore_pattern: '\p{P}'
      num_blocks: 6
      # NOTE(review): block-based SimHash matching usually requires
      # hamming_distance < num_blocks (4 < 6 here); confirm against the
      # operator's documentation before changing either value.
      hamming_distance: 4