Genderqueer Marquis(e)

Computational Methods of Authorship Attribution for A French Tale

Python Notebooks for Humanities Scholars

Translation Notebook

Download ipynb file here.

Basic installation needs:

!pip install deep-translator !pip install PyPDF2 # code below is only if you are using Google Colab from google.colab import drive drive.mount('/content/drive')

Convert pdf into plain text (optional):

from PyPDF2 import PdfReader


def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined in page order.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        One string containing the text extracted from each page, in page
        order. Pages that yield no text are skipped and a warning naming
        the (1-based) page number is printed.
    """
    reader = PdfReader(pdf_path)
    # Collect per-page text in a list and join once at the end instead of
    # growing a string with "+=" on every page.
    page_texts = []
    for page_num, page in enumerate(reader.pages, start=1):
        page_text = page.extract_text()
        if page_text:  # Check if text is extracted
            page_texts.append(page_text)
        else:
            print(f"Warning: No text found on page {page_num}")
    return "".join(page_texts)


# Define the path to your PDF file and output text file
pdf_path = 'path/to/input/pdf/file.pdf'
output_file = 'path/to/pdf/output.txt'

# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_path)

# Save the extracted text to a .txt file
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(pdf_text)

print("📄Text extraction complete!📄")

Split text into manageable chunks:

# Imported here so this cell runs on its own; os.path.splitext is used below.
import os


def split_text_file(input_file, max_length=4000):
    """Split a UTF-8 text file into numbered chunk files of limited size.

    The chunks are written next to the input file as
    ``<base>_1.txt``, ``<base>_2.txt``, ... where ``<base>`` is the input
    path without its extension. An empty input file produces no output files.

    Args:
        input_file: Path to the .txt file to split.
        max_length: Maximum number of characters per chunk. Change this to
            split the file differently.
            NOTE(review): the default of 4000 presumably keeps each chunk
            below the translation service's per-request size limit --
            confirm against the deep-translator documentation.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # Cut purely by character count; a chunk boundary may fall mid-sentence.
    chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]

    # Use os.path.splitext to split the filename and extension properly
    file_base_name = os.path.splitext(input_file)[0]

    for index, chunk in enumerate(chunks, start=1):
        output_file_name = f"{file_base_name}_{index}.txt"
        with open(output_file_name, 'w', encoding='utf-8') as file:
            file.write(chunk)


# Replace 'file.txt' with the path to your .txt file
split_text_file('path/to/input/text/file.txt')

print("✂️Successfully created TXT files!✂️")

The translation itself:

import os

from deep_translator import GoogleTranslator

# Set up the translator: French to English
translator = GoogleTranslator(source='fr', target='en')

# Define the folder containing the .txt files
input_folder = 'path/to/input/folder'
output_folder = 'path/to/output/folder'

# Ensure the output folder exists (no error if it is already there)
os.makedirs(output_folder, exist_ok=True)


def translate_file(input_file, output_file):
    """Translate one UTF-8 text file and write the result to a new file.

    Uses the module-level ``translator`` defined above.

    Args:
        input_file: Path of the text file to translate.
        output_file: Path the translated text is written to (overwritten
            if it already exists).
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # Translate the text
    translated_text = translator.translate(text)

    # Write the translated text to a new file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(translated_text)


# Loop through all .txt files in the input folder. os.listdir returns names
# in arbitrary order, so sort them to process files in a predictable order.
for filename in sorted(os.listdir(input_folder)):
    if filename.endswith('.txt'):
        input_file = os.path.join(input_folder, filename)
        output_file = os.path.join(output_folder, f'translated_{filename}')

        # Translate each file
        translate_file(input_file, output_file)

print("📝Translation complete!📝")

Concatenate the results into a single .txt file (optional):

# Combining translated text into a single file
# Imported here so this cell runs on its own.
import os
import re

# Define the folder containing the .txt files
input_folder = 'path/to/input/folder'
output_file = 'path/to/output/file.txt'


def _natural_key(name):
    """Sort key that compares digit runs in a file name as numbers.

    With this key 'text_2.txt' sorts before 'text_10.txt', so numbered
    chunks are concatenated in their original order.
    """
    # re.split with a capturing group alternates text / digits / text ...,
    # so every odd position is a run of digits.
    parts = re.split(r'(\d+)', name)
    return [int(part) if i % 2 else part.lower() for i, part in enumerate(parts)]


# Open the output file in write mode
with open(output_file, 'w', encoding='utf-8') as outfile:
    # os.listdir returns names in arbitrary order; sort them so the chunks
    # are joined in numeric sequence rather than at random.
    for filename in sorted(os.listdir(input_folder), key=_natural_key):
        if filename.endswith('.txt'):
            input_file = os.path.join(input_folder, filename)

            # Do not append the output file to itself when it lives inside
            # the input folder.
            if os.path.abspath(input_file) == os.path.abspath(output_file):
                continue

            # Open each file and append its content to the output file
            with open(input_file, 'r', encoding='utf-8') as infile:
                outfile.write(infile.read())
                outfile.write("\n")  # Add a newline between files

print("📚Concatenation complete!📚")

↑ Return to top