SoFunction
Updated on 2025-03-07

Reading the content of Doc, Docx and PDF documents programmatically in C#

This article describes how to read the content of Doc, Docx and PDF documents programmatically in C#. It is shared here for your reference. The libraries used and the complete code are as follows:

Doc documents: Microsoft Word 14.0 Object Library (a GAC object; Word must be installed before it can be called, and the version number of the COM library depends on the version of Word that is installed)
Docx documents: Microsoft Word 14.0 Object Library (a GAC object; Word must be installed before it can be called, and the version number of the COM library depends on the version of Word that is installed)
PDF Document: PDFBox

/*
  Author: GhostBear
  */
using System;
using System.Text.RegularExpressions;
using Microsoft.Office.Interop.Word;
// NOTE(review): the namespace names were lost from the published listing. The two
// PDFBox namespaces below assume the PDFBox 0.7.x .NET (IKVM) build - confirm them
// against the assemblies actually referenced by the project.
using org.pdfbox.pdmodel;
using org.pdfbox.util;
namespace TestPdfReader
{
    /// <summary>
    /// Console sample that extracts plain text from a PDF file (through PDFBox) and
    /// from a Word document (through the Microsoft Word COM object library) and
    /// writes the whitespace-stripped text to the console.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Reads one PDF file and one Word document, prints the text of each, then
        /// waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // NOTE(review): the file names were lost from the published listing (only
            // "C:\" survived). These are placeholders - replace them with real files.
            string pdfPath = @"C:\sample.pdf";
            string docPath = @"C:\sample.doc";

            //PDF
            Console.WriteLine(ReadPdfText(pdfPath));

            //Doc,Docx (a .docx path is passed to ReadWordText in exactly the same way)
            Console.WriteLine(ReadWordText(docPath));

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }

        /// <summary>
        /// Extracts the text of a PDF file with tabs, line breaks and spaces removed.
        /// </summary>
        /// <param name="path">Full path of the PDF file.</param>
        /// <returns>The document text without tabs, CR/LF characters or spaces.</returns>
        private static string ReadPdfText(string path)
        {
            PDDocument doc = PDDocument.load(path);
            try
            {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                string text = pdfStripper.getText(doc);
                // Tabs and line breaks are first turned into spaces, then every space is dropped.
                return text.Replace('\t', ' ').Replace('\n', ' ').Replace('\r', ' ').Replace(" ", "");
            }
            finally
            {
                // Release the file even when text extraction throws.
                doc.close();
            }
        }

        /// <summary>
        /// Opens a Word document read-only through Word automation and returns its
        /// text after <see cref="FilterString"/> has removed the whitespace.
        /// </summary>
        /// <param name="path">Full path of the .doc or .docx file.</param>
        /// <returns>The filtered document text.</returns>
        private static string ReadWordText(string path)
        {
            // The interop signatures take every argument as "ref object".
            object docPath = path;
            object missing = System.Reflection.Missing.Value;
            object readOnly = true;

            Application wordApp = new Application();
            try
            {
                Document wordDoc = wordApp.Documents.Open(ref docPath,
                    ref missing,
                    ref readOnly,
                    ref missing,
                    ref missing,
                    ref missing,
                    ref missing,
                    ref missing,
                    ref missing,
                    ref missing,
                    ref missing,
                    ref missing,
                    ref missing,
                    ref missing,
                    ref missing,
                    ref missing);
                try
                {
                    return FilterString(wordDoc.Content.Text);
                }
                finally
                {
                    wordDoc.Close(ref missing, ref missing, ref missing);
                }
            }
            finally
            {
                // Always shut Word down, otherwise a failed run leaves the process behind.
                wordApp.Quit(ref missing, ref missing, ref missing);
            }
        }

        /// <summary>
        /// Removes bell characters (\a), tabs, line feeds and runs of whitespace from
        /// the given text.
        /// </summary>
        /// <param name="input">Text to filter.</param>
        /// <returns>The text with every match of the pattern removed.</returns>
        private static string FilterString(string input)
        {
            return Regex.Replace(input, @"(\a|\t|\n|\s+)", "");
        }
    }
}

I hope this article helps you with your C# programming.