PDFファイル中の文書情報の取得

PDFファイル中には文書のタイトルや著作者情報が含まれています.その情報の記述方法にはXMPによるXMLの記述が含まれており,<xpacket>で始まり,</xpacket>で終わる記述を文書中から抽出し,XMLファイルとしてMSXML等でパースすれば文書情報を抽出することができます.
しかしながら,XMPによる記述が記入されていないことが大半であったため,別の方法はないかとPDFファイルを見てみると,文書中にタイトル(/Title)や著者(/Author)の情報がそのままテキストで記入されていることに気付き,その文字列のみを抜き出す例題を作成しました.抜き出したテキストはUnicode (UTF-16) で記述されていることがあるので,WideCharToMultiByte()関数によりワイド文字列からマルチバイト文字列へ変換しています.この関数はwindows.h内の記述に依存するのでWindowsでしか利用できません.Windows依存でない環境であれば

size_t wcstombs(char *dest, const wchar_t *src, size_t n);

という関数がありますが,文字化けして使えませんでした (wcstombs()は現在のロケールに従って変換するため,事前にsetlocale()で適切なロケールを設定しておく必要があります).以下のソース内のwcs2sjis()関数内の記述を修正すればUNIX環境等でも利用できるようになります (私の場合は以下の内容で満足なのでお任せします^^).

例題ファイル: src.pdf

PDFファイル中の文書情報の取得 (list_89.c)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <windows.h>

// When defined, the #ifdef DUMP sections in this file print diagnostic
// output (parsed characters and hex dumps) to stdout.
#define DUMP

/**
 * Translate the character that follows a backslash in a PDF literal string
 * into the character the escape sequence stands for.
 *
 * Handles \n, \r, \t, \b, \f, \(, \), \\ and \" (plus the legacy '7' -> BEL
 * mapping kept from the original code). Any other character is returned
 * unchanged.
 *
 * @param c character found directly after the backslash
 * @return the unescaped character
 */
char toescape(char c)
{
    switch(c) {
    case '7':  return '\7';
    case '(':  return '(';
    case ')':  return ')';
    case 't':  return '\t';
    case 'n':  return '\n';
    case 'r':  return '\r';
    case 'b':  return '\b';
    case 'f':  return '\f';
    case '\\': return '\\';
    case '\"': return '\"';
    }
    return c;
}

/**
 * Convert one hexadecimal digit character to its numeric value.
 *
 * @param c '0'-'9', 'a'-'f' or 'A'-'F'
 * @return 0-15 for a valid hex digit; the result for any other character is
 *         not meaningful
 */
char todigit(char c)
{
    // toupper() requires a value representable as unsigned char; a plain
    // (possibly negative) char taken from binary data would be undefined.
    const int upper = toupper((unsigned char)c);
    if(upper >= '0' && upper <= '9') return (char)(upper - '0');
    return (char)(upper - 'A' + 10);
}

/**
 * Tell whether a character forms a single-character escape sequence when it
 * follows a backslash (i.e. whether toescape() should be applied to it).
 *
 * @param c character found directly after the backslash
 * @return 1 for ( ) t n r b f \ and ", otherwise 0
 */
int isescape(char c)
{
    switch(c) {
    case '(': case ')':
    case 't': case 'n': case 'r': case 'b': case 'f':
    case '\\': case '\"':
        return 1;
    }
    return 0;
}

/**
 * Find the first occurrence of a substring.
 *
 * @param str  NUL-terminated text to search
 * @param find NUL-terminated text to look for
 * @return zero-based offset of the first match in str, or -1 when there is
 *         no match, when str is empty, or when either pointer is null
 */
int findString(const char* str, const char* find)
{
    // An empty haystack never matches, not even an empty needle.
    if(str == NULL || find == NULL || *str == '\0') return -1;

    // strstr scans once instead of re-measuring both strings per position.
    const char* hit = strstr(str, find);
    if(hit == NULL) return -1;
    return (int)(hit - str);
}

/**
 * Print a buffer for debugging: every byte as two hex digits followed by
 * ", ", then the buffer once more as a C string.
 *
 * @param str bytes to show; also handed to printf("%s"), so it has to be
 *            NUL-terminated
 * @param sn  number of bytes to print in hex
 */
void dump(unsigned char* str, int sn)
{
    int shown = 0;
    while(shown < sn){
        printf("%02x, ",str[shown]);
        ++shown;
    }
    printf(" : %s \n",str);
}

// Byte-order-mark code unit; wcs2sjis() drops every code unit equal to it
// while assembling the wide string.
#define BOM ((wchar_t)0xFEFF) // UTF-16 BOM

/**
 * Check whether a byte range consists solely of 7-bit ASCII values.
 *
 * @param str first byte to examine
 * @param sn  number of bytes to examine
 * @return 1 when every one of the sn bytes passes isascii() (also when sn is
 *         not positive), 0 as soon as one byte does not
 */
int isEnglish(char* str, int sn)
{
    const char* cursor = str;
    for(int left = sn; left > 0; --left, ++cursor){
        if(isascii(*cursor) == 0) return 0;
    }
    return 1;
}

void wcs2sjis(char* str, char* sjis, int sn)
{
    if(isEnglish(str,sn)){
        memset(sjis,0,sizeof(sjis));
        strncpy(sjis,str,strlen(str)+1);
        return;
    }
    wchar_t string[2048];
    wchar_t *dest = string;
    for(int i=0; i<sn; i++){
        wchar_t wc = (wchar_t)(((unsigned char)str[i] << 8) | (unsigned char)str[i+1]);
        if(wc != BOM) *dest++ = wc;
        i++;
    }
    *dest = L'\0';
    memset(sjis,0,sizeof(sjis));
    int nLen = ::WideCharToMultiByte(CP_ACP, 0, string, -1, NULL, 0, NULL, NULL );
    ::WideCharToMultiByte(CP_ACP, 0, string, -1, sjis, nLen, NULL, NULL );
}


int getPDFString(char* dst, char* src, int si, char* find)
{
    int b_esc = 0,ci = 0;
    int b_start = 0;
    memset(dst,0,sizeof(dst));
    for(unsigned int i=si+strlen(find); i<strlen(src); i++){ if(src[i] != ' ') { si = i; break; } }
    for(unsigned int i=si; i<strlen(src); i++){
        if(src[i] == '(') { b_start = 1; si = i+1; break; }
        else if(src[i] == '<') { b_start = 2; si = i+1; break; }
        else if(src[i] != '(' && src[i] != '<') { si = i; break; }
        
    }
    int iskip = (b_start == 2) ? 2 : 1;
    for(unsigned int i=si; i<strlen(src); i+=iskip){
        if(!b_esc && src[i] == '\\'){ b_esc = 1; continue; }
        if(b_esc){
            b_esc = 0;
            if(isescape(src[i])){
                dst[ci++] = toescape(src[i]);
#ifdef DUMP
                printf("[\\%c], ",src[i]);
#endif
                continue; }
            unsigned int ii, di = 0;
            char dv[10]; memset(dv,0,sizeof(dv));
            int b_flag = 0;
            for(ii=i; b_flag != 1; ii++){
                if(!(src[ii] >= '0' && src[ii] <= '9')) { b_flag = 1; continue; }
                if(ii == i+3){ b_flag = 1; continue; }
                dv[di++] = src[ii];
            }
            char *es;
            int base8 = strtol(dv,&es,8);
            dst[ci++] = base8;
            printf("[\\%o], ",base8);
            i = ii-2;
            continue;
        }
        if(src[i] == ')' && b_start == 1) break;
        if(src[i] == '>' && b_start == 2) break;
        if((src[i] == ' ' || src[i] == '/') && b_start == 0) break;
        if(b_start == 2) {
            if(src[i] == '0' && src[i] == '0') { dst[ci++] = 0; printf("0, "); continue; }    
            dst[ci++] = (todigit(src[i])<<4) + todigit(src[i+1]);
#ifdef DUMP
            printf("([%d,%02x,%c]: %c%c), ",(unsigned char)dst[ci-1],(unsigned char)dst[ci-1],dst[ci-1],src[i],src[i+1]);
#endif
        } else {
            dst[ci++] = src[i];
#ifdef DUMP
            printf("%c[%02x], ",src[i],(unsigned char)src[i]);
#endif
        }
    }
#ifdef DUMP
    printf("\n");
#endif
    return ci;
}

int main()
{
    const int BUFSIZE = 2048;
    FILE* fp = fopen("src.pdf","rb");
    char readbuf[BUFSIZE];
    if(fp == NULL) return 0;
    int count = 0;
    int b_title = 0, b_author = 0, b_count = 0;
    char title[BUFSIZE],author[BUFSIZE],scount[BUFSIZE];
    char title_s[BUFSIZE],author_s[BUFSIZE],scount_s[BUFSIZE];
    memset(readbuf,0,sizeof(readbuf));
    memset(title,0,sizeof(title));
    memset(author,0,sizeof(author));
    memset(scount,0,sizeof(scount));
    memset(title_s,0,sizeof(title));
    memset(author_s,0,sizeof(author));
    memset(scount_s,0,sizeof(scount_s));

    while(fgets(readbuf,BUFSIZE,fp)){
        int si = 0;
        if((si = findString(readbuf,"/Title")) >= 0 && b_title == 0){
            int cn = getPDFString(title,readbuf,si,"/Title");
            wcs2sjis(title,title_s,cn);
#ifdef DUMP
            printf("title: "); dump((unsigned char*)title,cn);
#endif
            b_title = 1;
        }
        if((si = findString(readbuf,"/Author")) >= 0 && b_author == 0){
            int cn = getPDFString(author,readbuf,si,"/Author");
            wcs2sjis(author,author_s,cn);
#ifdef DUMP
            printf("author: "); dump((unsigned char*)author,cn);
#endif
            b_author = 1;
        }
        if((si = findString(readbuf,"/N ")) >= 0 && b_count == 0){
            int cn = getPDFString(scount,readbuf,si,"/N ");
            if(strlen(scount) < 4){
#ifdef DUMP
                printf("count: "); dump((unsigned char*)scount,cn);
#endif
                count = atoi(scount);
                b_count = 1;
            }
        }
        if((si = findString(readbuf,"/Count")) >= 0 && b_count == 0){
            int cn = getPDFString(scount,readbuf,si,"/Count");
#ifdef DUMP
            printf("count: "); dump((unsigned char*)scount,cn);
#endif
            count = atoi(scount);
            b_count = 1;
        }
        if(b_title && b_author && b_count) break;
        memset(readbuf,0,sizeof(readbuf));
    }
    fclose(fp);
    
    if(b_title) printf("title is %s\n",title_s);
    if(b_author) printf("author is %s\n",author_s);
    if(b_count) printf("count is %d\n",count);

    return 0;
}

実行結果
Gami[238]% ./list_89.exe
2[32], [0d], 
[0a], 
count: 32, 0d, 0a,  : 2

a[61], -[2d], g[67], a[61], m[6d], y[79], l[6c], 
author: 61, 2d, 67, 61, 6d, 79, 6c,  : a-gamyl 
a[61], 2[32], p[70], d[64], f[66], 
title: 61, 32, 70, 64, 66,  : a2pdf 
title is a2pdf
author is a-gamyl
count is 2
Gami[1206]%
inserted by FC2 system