【Python PHP】Google スプレッドシートのurlからタイトルとmeta descriptionを取得する

Google スプレッドシートに書かれているurlからタイトルとmeta descriptionを取得するの続きです。他の言語、PythonとPHPでも同じことをやってみます。

Contents

1 Pythonの場合
2 PHPの場合

Pythonの場合

Pythonでスプレッドシートのデータの読み書きとHTMLの解析をするのであれば、準備としてpipコマンドで以下をインストールします。

# スプレッドシートの操作関連
pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib
pip install --upgrade oauth2client --ignore-installed six
pip install google-cloud-storage

# スクレイピング関連
pip install beautifulsoup4
pip install requests

# スプレッドシートの操作関連

pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

pip install --upgrade oauth2client --ignore-installed six

pip install google-cloud-storage

# スクレイピング関連

pip install beautifulsoup4

pip install requests

そのあと必要なモジュールをインポートして関数を定義します。

app.py

from googleapiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials

import requests
from bs4 import BeautifulSoup

# Fetch the data stored in a sheet range
def get_values(service, spreadsheetId, range):
    """Read the cells of ``range`` from the spreadsheet ``spreadsheetId``.

    Args:
        service: Sheets API service object (must expose ``spreadsheets()``).
        spreadsheetId: ID of the spreadsheet to read.
        range: Range to read, passed through to the API unchanged.

    Returns:
        The ``"values"`` entry of the API response, or an empty list when
        the response has no such entry.
    """
    request = service.spreadsheets().values().get(
        spreadsheetId=spreadsheetId, range=range
    )
    response = request.execute()
    return response.get("values", [])

# Write data into a sheet range
def update_values(service, spreadsheetId, range, values):
    """Write ``values`` into ``range`` of the spreadsheet ``spreadsheetId``.

    Args:
        service: Sheets API service object (must expose ``spreadsheets()``).
        spreadsheetId: ID of the spreadsheet to write to.
        range: Range to overwrite, passed through to the API unchanged.
        values: Rows to write; sent as the ``"values"`` entry of the body.

    Returns:
        Whatever the executed update request returns.
    """
    request = service.spreadsheets().values().update(
        spreadsheetId=spreadsheetId,
        range=range,
        valueInputOption="USER_ENTERED",
        body={"values": values},
    )
    return request.execute()

from googleapiclient.discovery import build

from oauth2client.service_account import ServiceAccountCredentials

import requests

from bs4 import BeautifulSoup

# Fetch the data stored in a sheet range
def get_values(service, spreadsheetId, range):
    """Read the cells of ``range`` from the spreadsheet ``spreadsheetId``.

    Args:
        service: Sheets API service object (must expose ``spreadsheets()``).
        spreadsheetId: ID of the spreadsheet to read.
        range: Range to read, passed through to the API unchanged.

    Returns:
        The ``"values"`` entry of the API response, or an empty list when
        the response has no such entry.
    """
    sheet = service.spreadsheets()
    result = (
        sheet.values().get(spreadsheetId=spreadsheetId, range=range).execute()
    )
    return result.get("values", [])

# Write data into a sheet range
def update_values(service, spreadsheetId, range, values):
    """Write ``values`` into ``range`` of the spreadsheet ``spreadsheetId``.

    Args:
        service: Sheets API service object (must expose ``spreadsheets()``).
        spreadsheetId: ID of the spreadsheet to write to.
        range: Range to overwrite, passed through to the API unchanged.
        values: Rows to write; sent as the ``"values"`` entry of the body.

    Returns:
        Whatever the executed update request returns.
    """
    body = {"values": values}
    result = (
        service.spreadsheets().values()
        .update(
            spreadsheetId=spreadsheetId,
            range=range,
            valueInputOption="USER_ENTERED",
            body=body,
        )
        .execute()
    )
    return result

定数を宣言します。このシートに書かれているurlを取得して結果を書き込みたいので、SPREADSHEET_IDとRANGEを以下のように定義します。シートの読み取りだけでなく書き込みもしたいので、SCOPES は ["https://www.googleapis.com/auth/spreadsheets"] にしています。

最後のsecret.jsonはGoogle スプレッドシートに書かれているurlからタイトルとmeta descriptionを取得するでサービスアカウントを作成したときにダウンロードした鍵です。

# ID of the spreadsheet whose URLs are read and whose results are written back.
SPREADSHEET_ID = "1PCAzttMI2r_Cwy9w9iTwLAKY6xiNcpGMrxItCvnlmUY"
# Sheet name plus cell range: columns A to C, starting at row 2.
RANGE = 'Pythonでサイト情報を取得!A2:C'
# Scope covering writing as well as reading the sheet.
SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]
# Key file downloaded when the service account was created.
SECRET_JSON = 'secret.json'

# ID of the spreadsheet whose URLs are read and whose results are written back.
SPREADSHEET_ID = "1PCAzttMI2r_Cwy9w9iTwLAKY6xiNcpGMrxItCvnlmUY"

# Sheet name plus cell range: columns A to C, starting at row 2.
RANGE = 'Pythonでサイト情報を取得!A2:C'

# Scope covering writing as well as reading the sheet.
SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]

# Key file downloaded when the service account was created.
SECRET_JSON = 'secret.json'

あとは定義した関数と定数から以下のようなコードを書きます。シートに書かれているデータは配列の配列で返されますが、空白のセルがあると配列のなかの配列の要素数が異なる場合があります。書き込むときは同じにしたいので新しい変数newValuesを宣言してそこに新しい値をセットしています。

URLとして渡された文字列が空文字であったり、URLではない文字列であったり、存在しないページを参照するURLであることが考えられるので例外処理をしています。

# Seconds to wait for each site. Without a timeout requests.get() can block
# forever on an unresponsive server and stall the whole batch.
REQUEST_TIMEOUT = 10


def _fetch_page_info(url):
    """Download ``url`` and return its ``(title, description)``.

    Returns 'no title' / 'no description' for whichever piece the page does
    not provide. Raises whatever ``requests`` raises for an invalid or
    unreachable URL, a timeout, or an HTTP error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # An error status (e.g. 404) is a failed fetch, not a page to scrape.
    response.raise_for_status()
    # Parse the raw bytes: response.text sometimes comes out garbled.
    soup = BeautifulSoup(response.content, "html.parser")

    title_tag = soup.find("title")
    title = title_tag.text if title_tag is not None else 'no title'

    meta = soup.find('meta', attrs={'name': 'description'})
    # .get() so that a description tag without a content attribute does not
    # raise and throw away the title that was already extracted.
    description = meta.get('content') if meta is not None else None
    if description is None:
        description = 'no description'

    return title, description


creds = ServiceAccountCredentials.from_json_keyfile_name(SECRET_JSON, SCOPES)
service = build("sheets", "v4", credentials=creds)

values = get_values(service, SPREADSHEET_ID, RANGE)
print('取得件数 = ' + str(len(values)))

# Rows come back with differing lengths when cells are blank, so build a new
# list in which every row has exactly three columns before writing back.
newValues = []
for row in values:
    # A row can be an empty list, or have an empty string in column A.
    url = row[0] if row else ''
    error = None
    if url == '':
        title = 'blank'
        description = 'blank'
    else:
        try:
            title, description = _fetch_page_info(url)
        except Exception as err:
            # Best effort: one bad URL must not abort the remaining rows,
            # but report the reason instead of silently dropping it.
            title = 'error'
            description = 'error'
            error = str(err)

    newValues.append([url, title, description])
    print('')
    print('url = ' + url)
    print('title = ' + title)
    print('description = ' + description)
    if error is not None:
        print('error = ' + error)

update_values(service, SPREADSHEET_ID, RANGE, newValues)
print('終了')

# Seconds to wait for each site. Without a timeout requests.get() can block
# forever on an unresponsive server and stall the whole batch.
REQUEST_TIMEOUT = 10


def _fetch_page_info(url):
    """Download ``url`` and return its ``(title, description)``.

    Returns 'no title' / 'no description' for whichever piece the page does
    not provide. Raises whatever ``requests`` raises for an invalid or
    unreachable URL, a timeout, or an HTTP error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # An error status (e.g. 404) is a failed fetch, not a page to scrape.
    response.raise_for_status()
    # Parse the raw bytes: response.text sometimes comes out garbled.
    soup = BeautifulSoup(response.content, "html.parser")

    title_tag = soup.find("title")
    title = title_tag.text if title_tag is not None else 'no title'

    meta = soup.find('meta', attrs={'name': 'description'})
    # .get() so that a description tag without a content attribute does not
    # raise and throw away the title that was already extracted.
    description = meta.get('content') if meta is not None else None
    if description is None:
        description = 'no description'

    return title, description


creds = ServiceAccountCredentials.from_json_keyfile_name(SECRET_JSON, SCOPES)
service = build("sheets", "v4", credentials=creds)

values = get_values(service, SPREADSHEET_ID, RANGE)
print('取得件数 = ' + str(len(values)))

# Rows come back with differing lengths when cells are blank, so build a new
# list in which every row has exactly three columns before writing back.
newValues = []
for row in values:
    # A row can be an empty list, or have an empty string in column A.
    url = row[0] if row else ''
    error = None
    if url == '':
        title = 'blank'
        description = 'blank'
    else:
        try:
            title, description = _fetch_page_info(url)
        except Exception as err:
            # Best effort: one bad URL must not abort the remaining rows,
            # but report the reason instead of silently dropping it.
            title = 'error'
            description = 'error'
            error = str(err)

    newValues.append([url, title, description])
    print('')
    print('url = ' + url)
    print('title = ' + title)
    print('description = ' + description)
    if error is not None:
        print('error = ' + error)

update_values(service, SPREADSHEET_ID, RANGE, newValues)
print('終了')

PHPの場合

PHPでスプレッドシートのデータの読み書きとHTMLの解析をするのであれば、準備としてcomposerで以下をインストールします。

composer require google/apiclient:^2.0

composer require google/apiclient:^2.0

そのあと以下のコードを書きます。

まずrequire_onceでcomposerでインストールしたファイルを読み込みます。そのあと各関数を定義します。

getService関数は認証情報からスプレッドシートにアクセスするためのサービスを作成します。getData関数はシートから読み出したデータのA列からwebページにアクセスしてタイトルとdescriptionを取得します。getValuesForUpdate関数はgetData関数が返した値からスプレッドシートを更新するためのデータを作成します。

app.php

<?php
require_once __DIR__.'/vendor/autoload.php';

/**
 * Build the Sheets service used to read and write the spreadsheet.
 *
 * Authenticates with the key file 'secret.json' and requests the
 * SPREADSHEETS scope.
 *
 * @return Google_Service_Sheets
 */
function getService(){
    $client = new Google\Client();
    $client->setAuthConfig('secret.json');
    // Set the scopes once; the original called setScopes() twice and the
    // first call was simply overwritten by the second.
    $client->setScopes([
        Google_Service_Sheets::SPREADSHEETS, // spreadsheets
    ]);
    return new Google_Service_Sheets($client);
}

/**
 * Fetch the page whose URL is in column A of $row and extract its title
 * and meta description.
 *
 * @param array $row One sheet row; may be empty or have a blank first cell.
 * @return array array($url, $title, $description). Title and description are
 *               'blank' when there is no URL, 'error' when the page cannot be
 *               fetched, and 'no title' / 'no description' when the page
 *               lacks the corresponding element.
 */
function getData($row){
    // The row may not exist at all, or only column A may be blank.
    if(count($row) == 0 || $row[0] == ''){
        return array('', 'blank', 'blank');
    }
    $url = $row[0];

    // Only fetch web URLs: file_get_contents() would otherwise happily read
    // a local file path typed into the sheet.
    if(!preg_match('#^https?://#i', $url)){
        return array($url, 'error', 'error');
    }

    // Bound the wait per site so one slow server cannot stall the batch.
    $context = stream_context_create(array('http' => array('timeout' => 10)));
    $html = @file_get_contents($url, false, $context);
    if($html === false){
        return array($url, 'error', 'error');
    }

    $title = 'no title';             // used when the page has no <title>
    $description = 'no description'; // used when the page has no description

    // loadHTML() throws a ValueError for an empty string on PHP 8 (and "@"
    // does not suppress exceptions), so only parse a non-empty body.
    if($html !== ''){
        $dom = new DOMDocument('1.0', 'UTF-8');
        // NOTE(review): loadHTML() may mis-decode pages that declare no
        // charset; confirm non-ASCII titles come through intact.
        @$dom->loadHTML($html); // "@" hides warnings about malformed markup
        $xpath = new DOMXpath($dom);

        $item = $xpath->query("//title")->item(0);
        if($item != null){
            $title = $item->nodeValue;
        }

        $item = $xpath->query("//meta[@name='description']/@content")->item(0);
        if($item != null){
            $description = $item->nodeValue;
        }
    }
    return array($url, $title, $description);
}

/**
 * Convert the rows read from the sheet into the value range to write back.
 *
 * Each row is passed through getData(), so every output row has exactly
 * three columns: url, title, description.
 *
 * @param array|null $rows Rows read from the sheet; null is treated as empty.
 * @return Google_Service_Sheets_ValueRange
 */
function getValuesForUpdate($rows) {
    // An empty range can come back without any values; avoid iterating a
    // non-array, which makes foreach emit a warning.
    if (!is_array($rows)) {
        $rows = array();
    }

    $arr = array();
    foreach ($rows as $row) {
        $arr[] = getData($row);
    }
    return new Google_Service_Sheets_ValueRange([ 'values' => $arr ]);
}

<?php

require_once __DIR__.'/vendor/autoload.php';

/**
 * Build the Sheets service used to read and write the spreadsheet.
 *
 * Authenticates with the key file 'secret.json' and requests the
 * SPREADSHEETS scope.
 *
 * @return Google_Service_Sheets
 */
function getService(){
    $client = new Google\Client();
    $client->setAuthConfig('secret.json');
    // Set the scopes once; the original called setScopes() twice and the
    // first call was simply overwritten by the second.
    $client->setScopes([
        Google_Service_Sheets::SPREADSHEETS, // spreadsheets
    ]);
    return new Google_Service_Sheets($client);
}

/**
 * Fetch the page whose URL is in column A of $row and extract its title
 * and meta description.
 *
 * @param array $row One sheet row; may be empty or have a blank first cell.
 * @return array array($url, $title, $description). Title and description are
 *               'blank' when there is no URL, 'error' when the page cannot be
 *               fetched, and 'no title' / 'no description' when the page
 *               lacks the corresponding element.
 */
function getData($row){
    // The row may not exist at all, or only column A may be blank.
    if(count($row) == 0 || $row[0] == ''){
        return array('', 'blank', 'blank');
    }
    $url = $row[0];

    // Only fetch web URLs: file_get_contents() would otherwise happily read
    // a local file path typed into the sheet.
    if(!preg_match('#^https?://#i', $url)){
        return array($url, 'error', 'error');
    }

    // Bound the wait per site so one slow server cannot stall the batch.
    $context = stream_context_create(array('http' => array('timeout' => 10)));
    $html = @file_get_contents($url, false, $context);
    if($html === false){
        return array($url, 'error', 'error');
    }

    $title = 'no title';             // used when the page has no <title>
    $description = 'no description'; // used when the page has no description

    // loadHTML() throws a ValueError for an empty string on PHP 8 (and "@"
    // does not suppress exceptions), so only parse a non-empty body.
    if($html !== ''){
        $dom = new DOMDocument('1.0', 'UTF-8');
        // NOTE(review): loadHTML() may mis-decode pages that declare no
        // charset; confirm non-ASCII titles come through intact.
        @$dom->loadHTML($html); // "@" hides warnings about malformed markup
        $xpath = new DOMXpath($dom);

        $item = $xpath->query("//title")->item(0);
        if($item != null){
            $title = $item->nodeValue;
        }

        $item = $xpath->query("//meta[@name='description']/@content")->item(0);
        if($item != null){
            $description = $item->nodeValue;
        }
    }
    return array($url, $title, $description);
}

/**
 * Convert the rows read from the sheet into the value range to write back.
 *
 * Each row is passed through getData(), so every output row has exactly
 * three columns: url, title, description.
 *
 * @param array|null $rows Rows read from the sheet; null is treated as empty.
 * @return Google_Service_Sheets_ValueRange
 */
function getValuesForUpdate($rows) {
    // An empty range can come back without any values; avoid iterating a
    // non-array, which makes foreach emit a warning.
    if (!is_array($rows)) {
        $rows = array();
    }

    $arr = array();
    foreach ($rows as $row) {
        $arr[] = getData($row);
    }
    return new Google_Service_Sheets_ValueRange([ 'values' => $arr ]);
}

上記の関数を利用して処理をおこないます。スプレッドシートのIDとシート名と範囲からデータを取得します。そして該当ページをスクレイピングしてタイトルとdescriptionを取得してこれをスプレッドシートに書き込みます。

app.php

// Spreadsheet ID, and the sheet name plus range to read and overwrite.
$spreadsheet_id = '1PCAzttMI2r_Cwy9w9iTwLAKY6xiNcpGMrxItCvnlmUY';
$sheet_range = 'PHPでサイト情報を取得!A2:C';
$service = getService();

// Read the current rows of the range.
$response = $service->spreadsheets_values->get($spreadsheet_id, $sheet_range);
$rows = $response->getValues();

// Scrape each URL, then write url / title / description back to the same range.
$body = getValuesForUpdate($rows);
$options = [ 'valueInputOption' => 'USER_ENTERED' ];
$service->spreadsheets_values->update($spreadsheet_id, $sheet_range, $body, $options);
echo '完了<br>';

// Spreadsheet ID, and the sheet name plus range to read and overwrite.
$spreadsheet_id = '1PCAzttMI2r_Cwy9w9iTwLAKY6xiNcpGMrxItCvnlmUY';
$sheet_range = 'PHPでサイト情報を取得!A2:C';
$service = getService();

// Read the current rows of the range.
$response = $service->spreadsheets_values->get($spreadsheet_id, $sheet_range);
$rows = $response->getValues();

// Scrape each URL, then write url / title / description back to the same range.
$body = getValuesForUpdate($rows);
$options = [ 'valueInputOption' => 'USER_ENTERED' ];
$service->spreadsheets_values->update($spreadsheet_id, $sheet_range, $body, $options);
echo '完了<br>';

【Python PHP】Google スプレッドシートのurlからタイトルとmeta descriptionを取得する

Pythonの場合

PHPの場合

鳩でも分かるC#管理人からのお願い

コメントについて

管理人のモチベーションアップのために

コメントをどうぞ